#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without       #
#  modification, are permitted provided that the following conditions are   #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright        #
#     notice, this list of conditions and the following disclaimer.         #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright     #
#     notice, this list of conditions and the following disclaimer in the   #
#     documentation and/or other materials provided with the                #
#     distribution.                                                         #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its        #
#     contributors may be used to endorse or promote products derived from  #
#     this software without specific prior written permission.              #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY         #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE        #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR       #
#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR           #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,    #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,      #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR       #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             #
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Corporation, Israel Development Center, Haifa, Israel            #
# (2) University of Haifa, Israel                                            #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular             #
#     Exponentiation, Using Advanced Vector Instructions Architectures",     #
#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,   #
#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012              #
# [2] S. Gueron: "Efficient Software Implementations of Modular              #
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE        #
#     Proceedings of 9th International Conference on Information Technology: #
#     New Generations (ITNG 2012), pp.821-823 (2012)                         #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
#     resistant 1024-bit modular exponentiation, for optimizing RSA2048      #
#     on AVX2 capable x86_64 platforms",                                     #
#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
##############################################################################
#
# +13% improvement over original submission by <appro@openssl.org>
#
# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
# 2.3GHz Haswell	621		765/+23%	1113/+79%
# 2.3GHz Broadwell(**)	688		1200(***)/+74%	1120/+63%
#
# (*)	if system doesn't support AVX2, for reference purposes;
# (**)	scaled to 2.3GHz to simplify comparison;
# (***)	scalar AD*X code is faster than AVX2 and is preferred code
#	path for Broadwell;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
	$addx = ($1>=2.23);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
	$addx = ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
	$addx = ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$avx = ($ver>=3.0) + ($ver>=3.01);
	$addx = ($ver>=3.03);
}

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT = *OUT;

if ($avx>1) {{{
{ # void AMS_WW(
my $rp="%rdi";	# BN_ULONG *rp,
my $ap="%rsi";	# const BN_ULONG *ap,
my $np="%rdx";	# const BN_ULONG *np,
my $n0="%ecx";	# const BN_ULONG n0,
my $rep="%r8d";	# int repeat);

# The registers that hold the accumulated redundant result
# The AMM works on 1024 bit operands, and redundant word size is 29
# Therefore: ceil(1024/29)/4 = 9
my $ACC0="%ymm0";
my $ACC1="%ymm1";
my $ACC2="%ymm2";
my $ACC3="%ymm3";
my $ACC4="%ymm4";
my $ACC5="%ymm5";
my $ACC6="%ymm6";
my $ACC7="%ymm7";
my $ACC8="%ymm8";
my $ACC9="%ymm9";
# Registers that hold the broadcasted words of bp, currently used
my $B1="%ymm10";
my $B2="%ymm11";
# Registers that hold the broadcasted words of Y, currently used
my $Y1="%ymm12";
my $Y2="%ymm13";
# Helper registers
my $TEMP1="%ymm14";
my $AND_MASK="%ymm15";
# alu registers that hold the first words of the ACC
my $r0="%r9";
my $r1="%r10";
my $r2="%r11";
my $r3="%r12";

my $i="%r14d";			# loop counter
my $tmp = "%r15";

my $FrameSize=32*18+32*8;	# place for A^2 and 2*A

my $aap=$r0;
my $tp0="%rbx";
my $tp1=$r3;
my $tpa=$tmp;

$np="%r13";			# reassigned argument

$code.=<<___;
.text

.globl	rsaz_1024_sqr_avx2
.type	rsaz_1024_sqr_avx2,\@function,5
.align	64
rsaz_1024_sqr_avx2:		# 702 cycles, 14% faster than rsaz_1024_mul_avx2
	lea	(%rsp), %rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	vzeroupper
___
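# %xmm6-%xmm15 are callee-saved under the Win64 calling convention, so the
# conditional block below spills them into the stack frame; the epilogue and
# the SEH handler at the end of this file restore them from the same offsets.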
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	vmovaps	%xmm6,-0xd8(%rax)
	vmovaps	%xmm7,-0xc8(%rax)
	vmovaps	%xmm8,-0xb8(%rax)
	vmovaps	%xmm9,-0xa8(%rax)
	vmovaps	%xmm10,-0x98(%rax)
	vmovaps	%xmm11,-0x88(%rax)
	vmovaps	%xmm12,-0x78(%rax)
	vmovaps	%xmm13,-0x68(%rax)
	vmovaps	%xmm14,-0x58(%rax)
	vmovaps	%xmm15,-0x48(%rax)
.Lsqr_1024_body:
___
$code.=<<___;
	mov	%rax,%rbp
	mov	%rdx, $np			# reassigned argument
	sub	\$$FrameSize, %rsp
	mov	$np, $tmp
	sub	\$-128, $rp			# size optimization
	sub	\$-128, $ap
	sub	\$-128, $np

	and	\$4095, $tmp			# see if $np crosses page
	add	\$32*10, $tmp
	shr	\$12, $tmp
	vpxor	$ACC9,$ACC9,$ACC9
	jz	.Lsqr_1024_no_n_copy

	# unaligned 256-bit load that crosses page boundary can
	# cause >2x performance degradation here, so if $np does
	# cross page boundary, copy it to stack and make sure stack
	# frame doesn't...
	sub	\$32*10,%rsp
	vmovdqu	32*0-128($np), $ACC0
	and	\$-2048, %rsp
	vmovdqu	32*1-128($np), $ACC1
	vmovdqu	32*2-128($np), $ACC2
	vmovdqu	32*3-128($np), $ACC3
	vmovdqu	32*4-128($np), $ACC4
	vmovdqu	32*5-128($np), $ACC5
	vmovdqu	32*6-128($np), $ACC6
	vmovdqu	32*7-128($np), $ACC7
	vmovdqu	32*8-128($np), $ACC8
	lea	$FrameSize+128(%rsp),$np
	vmovdqu	$ACC0, 32*0-128($np)
	vmovdqu	$ACC1, 32*1-128($np)
	vmovdqu	$ACC2, 32*2-128($np)
	vmovdqu	$ACC3, 32*3-128($np)
	vmovdqu	$ACC4, 32*4-128($np)
	vmovdqu	$ACC5, 32*5-128($np)
	vmovdqu	$ACC6, 32*6-128($np)
	vmovdqu	$ACC7, 32*7-128($np)
	vmovdqu	$ACC8, 32*8-128($np)
	vmovdqu	$ACC9, 32*9-128($np)		# $ACC9 is zero

.Lsqr_1024_no_n_copy:
	and	\$-1024, %rsp

	vmovdqu	32*1-128($ap), $ACC1
	vmovdqu	32*2-128($ap), $ACC2
	vmovdqu	32*3-128($ap), $ACC3
	vmovdqu	32*4-128($ap), $ACC4
	vmovdqu	32*5-128($ap), $ACC5
	vmovdqu	32*6-128($ap), $ACC6
	vmovdqu	32*7-128($ap), $ACC7
	vmovdqu	32*8-128($ap), $ACC8

	lea	192(%rsp), $tp0			# 64+128=192
	vpbroadcastq	.Land_mask(%rip), $AND_MASK
	jmp	.LOOP_GRANDE_SQR_1024

.align	32
.LOOP_GRANDE_SQR_1024:
	lea	32*18+128(%rsp), $aap		# size optimization
	lea	448(%rsp), $tp1			# 64+128+256=448

	# the squaring is performed as described in Variant B of
	# "Speeding up Big-Number Squaring", so start by calculating
	# the A*2=A+A vector
	vpaddq	$ACC1, $ACC1, $ACC1
	vpbroadcastq	32*0-128($ap), $B1
	vpaddq	$ACC2, $ACC2, $ACC2
	vmovdqa	$ACC1, 32*0-128($aap)
	vpaddq	$ACC3, $ACC3, $ACC3
	vmovdqa	$ACC2, 32*1-128($aap)
	vpaddq	$ACC4, $ACC4, $ACC4
	vmovdqa	$ACC3, 32*2-128($aap)
	vpaddq	$ACC5, $ACC5, $ACC5
	vmovdqa	$ACC4, 32*3-128($aap)
	vpaddq	$ACC6, $ACC6, $ACC6
	vmovdqa	$ACC5, 32*4-128($aap)
	vpaddq	$ACC7, $ACC7, $ACC7
	vmovdqa	$ACC6, 32*5-128($aap)
	vpaddq	$ACC8, $ACC8, $ACC8
	vmovdqa	$ACC7, 32*6-128($aap)
	vpxor	$ACC9, $ACC9, $ACC9
	vmovdqa	$ACC8, 32*7-128($aap)

	vpmuludq	32*0-128($ap), $B1, $ACC0
	vpbroadcastq	32*1-128($ap), $B2
	vmovdqu	$ACC9, 32*9-192($tp0)		# zero upper half
	vpmuludq	$B1, $ACC1, $ACC1
	vmovdqu	$ACC9, 32*10-448($tp1)
	vpmuludq	$B1, $ACC2, $ACC2
	vmovdqu	$ACC9, 32*11-448($tp1)
	vpmuludq	$B1, $ACC3, $ACC3
	vmovdqu	$ACC9, 32*12-448($tp1)
	vpmuludq	$B1, $ACC4, $ACC4
	vmovdqu	$ACC9, 32*13-448($tp1)
	vpmuludq	$B1, $ACC5, $ACC5
	vmovdqu	$ACC9, 32*14-448($tp1)
	vpmuludq	$B1, $ACC6, $ACC6
	vmovdqu	$ACC9, 32*15-448($tp1)
	vpmuludq	$B1, $ACC7, $ACC7
	vmovdqu	$ACC9, 32*16-448($tp1)
	vpmuludq	$B1, $ACC8, $ACC8
	vpbroadcastq	32*2-128($ap), $B1
	vmovdqu	$ACC9, 32*17-448($tp1)

	mov	$ap, $tpa
	mov	\$4, $i
	jmp	.Lsqr_entry_1024
___
$TEMP0=$Y1;
$TEMP2=$Y2;
$code.=<<___;
.align	32
.LOOP_SQR_1024:
	vpbroadcastq	32*1-128($tpa), $B2
	vpmuludq	32*0-128($ap), $B1, $ACC0
	vpaddq	32*0-192($tp0), $ACC0, $ACC0
	vpmuludq	32*0-128($aap), $B1, $ACC1
	vpaddq	32*1-192($tp0), $ACC1, $ACC1
	vpmuludq	32*1-128($aap), $B1, $ACC2
	vpaddq	32*2-192($tp0), $ACC2, $ACC2
	vpmuludq	32*2-128($aap), $B1, $ACC3
	vpaddq	32*3-192($tp0), $ACC3, $ACC3
	vpmuludq	32*3-128($aap), $B1, $ACC4
	vpaddq	32*4-192($tp0), $ACC4, $ACC4
	vpmuludq	32*4-128($aap), $B1, $ACC5
	vpaddq	32*5-192($tp0), $ACC5, $ACC5
	vpmuludq	32*5-128($aap), $B1, $ACC6
	vpaddq	32*6-192($tp0), $ACC6, $ACC6
	vpmuludq	32*6-128($aap), $B1, $ACC7
	vpaddq	32*7-192($tp0), $ACC7, $ACC7
	vpmuludq	32*7-128($aap), $B1, $ACC8
	vpbroadcastq	32*2-128($tpa), $B1
	vpaddq	32*8-192($tp0), $ACC8, $ACC8
.Lsqr_entry_1024:
	vmovdqu	$ACC0, 32*0-192($tp0)
	vmovdqu	$ACC1, 32*1-192($tp0)

	vpmuludq	32*1-128($ap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC2, $ACC2
	vpmuludq	32*1-128($aap), $B2, $TEMP1
	vpaddq	$TEMP1, $ACC3, $ACC3
	vpmuludq	32*2-128($aap), $B2, $TEMP2
	vpaddq	$TEMP2, $ACC4, $ACC4
	vpmuludq	32*3-128($aap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC5, $ACC5
	vpmuludq	32*4-128($aap), $B2, $TEMP1
	vpaddq	$TEMP1, $ACC6, $ACC6
	vpmuludq	32*5-128($aap), $B2, $TEMP2
	vpaddq	$TEMP2, $ACC7, $ACC7
	vpmuludq	32*6-128($aap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC8, $ACC8
	vpmuludq	32*7-128($aap), $B2, $ACC0
	vpbroadcastq	32*3-128($tpa), $B2
	vpaddq	32*9-192($tp0), $ACC0, $ACC0

	vmovdqu	$ACC2, 32*2-192($tp0)
	vmovdqu	$ACC3, 32*3-192($tp0)

	vpmuludq	32*2-128($ap), $B1, $TEMP2
	vpaddq	$TEMP2, $ACC4, $ACC4
	vpmuludq	32*2-128($aap), $B1, $TEMP0
	vpaddq	$TEMP0, $ACC5, $ACC5
	vpmuludq	32*3-128($aap), $B1, $TEMP1
	vpaddq	$TEMP1, $ACC6, $ACC6
	vpmuludq	32*4-128($aap), $B1, $TEMP2
	vpaddq	$TEMP2, $ACC7, $ACC7
	vpmuludq	32*5-128($aap), $B1, $TEMP0
	vpaddq	$TEMP0, $ACC8, $ACC8
	vpmuludq	32*6-128($aap), $B1, $TEMP1
	vpaddq	$TEMP1, $ACC0, $ACC0
	vpmuludq	32*7-128($aap), $B1, $ACC1
	vpbroadcastq	32*4-128($tpa), $B1
	vpaddq	32*10-448($tp1), $ACC1, $ACC1

	vmovdqu	$ACC4, 32*4-192($tp0)
	vmovdqu	$ACC5, 32*5-192($tp0)

	vpmuludq	32*3-128($ap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC6, $ACC6
	vpmuludq	32*3-128($aap), $B2, $TEMP1
	vpaddq	$TEMP1, $ACC7, $ACC7
	vpmuludq	32*4-128($aap), $B2, $TEMP2
	vpaddq	$TEMP2, $ACC8, $ACC8
	vpmuludq	32*5-128($aap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC0, $ACC0
	vpmuludq	32*6-128($aap), $B2, $TEMP1
	vpaddq	$TEMP1, $ACC1, $ACC1
	vpmuludq	32*7-128($aap), $B2, $ACC2
	vpbroadcastq	32*5-128($tpa), $B2
	vpaddq	32*11-448($tp1), $ACC2, $ACC2

	vmovdqu	$ACC6, 32*6-192($tp0)
	vmovdqu	$ACC7, 32*7-192($tp0)

	vpmuludq	32*4-128($ap), $B1, $TEMP0
	vpaddq	$TEMP0, $ACC8, $ACC8
	vpmuludq	32*4-128($aap), $B1, $TEMP1
	vpaddq	$TEMP1, $ACC0, $ACC0
	vpmuludq	32*5-128($aap), $B1, $TEMP2
	vpaddq	$TEMP2, $ACC1, $ACC1
	vpmuludq	32*6-128($aap), $B1, $TEMP0
	vpaddq	$TEMP0, $ACC2, $ACC2
	vpmuludq	32*7-128($aap), $B1, $ACC3
	vpbroadcastq	32*6-128($tpa), $B1
	vpaddq	32*12-448($tp1), $ACC3, $ACC3

	vmovdqu	$ACC8, 32*8-192($tp0)
	vmovdqu	$ACC0, 32*9-192($tp0)
	lea	8($tp0), $tp0

	vpmuludq	32*5-128($ap), $B2, $TEMP2
	vpaddq	$TEMP2, $ACC1, $ACC1
	vpmuludq	32*5-128($aap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC2, $ACC2
	vpmuludq	32*6-128($aap), $B2, $TEMP1
	vpaddq	$TEMP1, $ACC3, $ACC3
	vpmuludq	32*7-128($aap), $B2, $ACC4
	vpbroadcastq	32*7-128($tpa), $B2
	vpaddq	32*13-448($tp1), $ACC4, $ACC4

	vmovdqu	$ACC1, 32*10-448($tp1)
	vmovdqu	$ACC2, 32*11-448($tp1)

	vpmuludq	32*6-128($ap), $B1, $TEMP0
	vpaddq	$TEMP0, $ACC3, $ACC3
	vpmuludq	32*6-128($aap), $B1, $TEMP1
	vpbroadcastq	32*8-128($tpa), $ACC0	# borrow $ACC0 for $B1
	vpaddq	$TEMP1, $ACC4, $ACC4
	vpmuludq	32*7-128($aap), $B1, $ACC5
	vpbroadcastq	32*0+8-128($tpa), $B1	# for next iteration
	vpaddq	32*14-448($tp1), $ACC5, $ACC5

	vmovdqu	$ACC3, 32*12-448($tp1)
	vmovdqu	$ACC4, 32*13-448($tp1)
	lea	8($tpa), $tpa

	vpmuludq	32*7-128($ap), $B2, $TEMP0
	vpaddq	$TEMP0, $ACC5, $ACC5
	vpmuludq	32*7-128($aap), $B2, $ACC6
	vpaddq	32*15-448($tp1), $ACC6, $ACC6

	vpmuludq	32*8-128($ap), $ACC0, $ACC7
	vmovdqu	$ACC5, 32*14-448($tp1)
	vpaddq	32*16-448($tp1), $ACC7, $ACC7
	vmovdqu	$ACC6, 32*15-448($tp1)
	vmovdqu	$ACC7, 32*16-448($tp1)
	lea	8($tp1), $tp1

	dec	$i
	jnz	.LOOP_SQR_1024
___
$ZERO = $ACC9;
$TEMP0 = $B1;
$TEMP2 = $B2;
$TEMP3 = $Y1;
$TEMP4 = $Y2;
$code.=<<___;
	# we need to fix indexes 32-39 to avoid overflow
	vmovdqu	32*8(%rsp), $ACC8		# 32*8-192($tp0),
	vmovdqu	32*9(%rsp), $ACC1		# 32*9-192($tp0)
	vmovdqu	32*10(%rsp), $ACC2		# 32*10-192($tp0)
	lea	192(%rsp), $tp0			# 64+128=192

	vpsrlq	\$29, $ACC8, $TEMP1
	vpand	$AND_MASK, $ACC8, $ACC8
	vpsrlq	\$29, $ACC1, $TEMP2
	vpand	$AND_MASK, $ACC1, $ACC1

	vpermq	\$0x93, $TEMP1, $TEMP1
	vpxor	$ZERO, $ZERO, $ZERO
	vpermq	\$0x93, $TEMP2, $TEMP2

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC8, $ACC8
	vpblendd	\$3, $TEMP2, $ZERO, $TEMP2
	vpaddq	$TEMP1, $ACC1, $ACC1
	vpaddq	$TEMP2, $ACC2, $ACC2
	vmovdqu	$ACC1, 32*9-192($tp0)
	vmovdqu	$ACC2, 32*10-192($tp0)

	mov	(%rsp), %rax
	mov	8(%rsp), $r1
	mov	16(%rsp), $r2
	mov	24(%rsp), $r3
	vmovdqu	32*1(%rsp), $ACC1
	vmovdqu	32*2-192($tp0), $ACC2
	vmovdqu	32*3-192($tp0), $ACC3
	vmovdqu	32*4-192($tp0), $ACC4
	vmovdqu	32*5-192($tp0), $ACC5
	vmovdqu	32*6-192($tp0), $ACC6
	vmovdqu	32*7-192($tp0), $ACC7

	mov	%rax, $r0
	imull	$n0, %eax
	and	\$0x1fffffff, %eax
	vmovd	%eax, $Y1

	mov	%rax, %rdx
	imulq	-128($np), %rax
	vpbroadcastq	$Y1, $Y1
	add	%rax, $r0
	mov	%rdx, %rax
	imulq	8-128($np), %rax
	shr	\$29, $r0
	add	%rax, $r1
	mov	%rdx, %rax
	imulq	16-128($np), %rax
	add	$r0, $r1
	add	%rax, $r2
	imulq	24-128($np), %rdx
	add	%rdx, $r3

	mov	$r1, %rax
	imull	$n0, %eax
	and	\$0x1fffffff, %eax

	mov	\$9, $i
	jmp	.LOOP_REDUCE_1024

.align	32
.LOOP_REDUCE_1024:
	vmovd	%eax, $Y2
	vpbroadcastq	$Y2, $Y2

	vpmuludq	32*1-128($np), $Y1, $TEMP0
	mov	%rax, %rdx
	imulq	-128($np), %rax
	vpaddq	$TEMP0, $ACC1, $ACC1
	add	%rax, $r1
	vpmuludq	32*2-128($np), $Y1, $TEMP1
	mov	%rdx, %rax
	imulq	8-128($np), %rax
	vpaddq	$TEMP1, $ACC2, $ACC2
	vpmuludq	32*3-128($np), $Y1, $TEMP2
	.byte	0x67
	add	%rax, $r2
	.byte	0x67
	mov	%rdx, %rax
	imulq	16-128($np), %rax
	shr	\$29, $r1
	vpaddq	$TEMP2, $ACC3, $ACC3
	vpmuludq	32*4-128($np), $Y1, $TEMP0
	add	%rax, $r3
	add	$r1, $r2
	vpaddq	$TEMP0, $ACC4, $ACC4
	vpmuludq	32*5-128($np), $Y1, $TEMP1
	mov	$r2, %rax
	imull	$n0, %eax
	vpaddq	$TEMP1, $ACC5, $ACC5
	vpmuludq	32*6-128($np), $Y1, $TEMP2
	and	\$0x1fffffff, %eax
	vpaddq	$TEMP2, $ACC6, $ACC6
	vpmuludq	32*7-128($np), $Y1, $TEMP0
	vpaddq	$TEMP0, $ACC7, $ACC7
	vpmuludq	32*8-128($np), $Y1, $TEMP1
	vmovd	%eax, $Y1
	#vmovdqu	32*1-8-128($np), $TEMP2		# moved below
	vpaddq	$TEMP1, $ACC8, $ACC8
	#vmovdqu	32*2-8-128($np), $TEMP0		# moved below
	vpbroadcastq	$Y1, $Y1

	vpmuludq	32*1-8-128($np), $Y2, $TEMP2	# see above
	vmovdqu	32*3-8-128($np), $TEMP1
	mov	%rax, %rdx
	imulq	-128($np), %rax
	vpaddq	$TEMP2, $ACC1, $ACC1
	vpmuludq	32*2-8-128($np), $Y2, $TEMP0	# see above
	vmovdqu	32*4-8-128($np), $TEMP2
	add	%rax, $r2
	mov	%rdx, %rax
	imulq	8-128($np), %rax
	vpaddq	$TEMP0, $ACC2, $ACC2
	add	$r3, %rax
	shr	\$29, $r2
	vpmuludq	$Y2, $TEMP1, $TEMP1
	vmovdqu	32*5-8-128($np), $TEMP0
	add	$r2, %rax
	vpaddq	$TEMP1, $ACC3, $ACC3
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vmovdqu	32*6-8-128($np), $TEMP1
	.byte	0x67
	mov	%rax, $r3
	imull	$n0, %eax
	vpaddq	$TEMP2, $ACC4, $ACC4
	vpmuludq	$Y2, $TEMP0, $TEMP0
	.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00	# vmovdqu 32*7-8-128($np), $TEMP2
	and	\$0x1fffffff, %eax
	vpaddq	$TEMP0, $ACC5, $ACC5
	vpmuludq	$Y2, $TEMP1, $TEMP1
	vmovdqu	32*8-8-128($np), $TEMP0
	vpaddq	$TEMP1, $ACC6, $ACC6
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vmovdqu	32*9-8-128($np), $ACC9
	vmovd	%eax, $ACC0			# borrow ACC0 for Y2
	imulq	-128($np), %rax
	vpaddq	$TEMP2, $ACC7, $ACC7
	vpmuludq	$Y2, $TEMP0, $TEMP0
	vmovdqu	32*1-16-128($np), $TEMP1
	vpbroadcastq	$ACC0, $ACC0
	vpaddq	$TEMP0, $ACC8, $ACC8
	vpmuludq	$Y2, $ACC9, $ACC9
	vmovdqu	32*2-16-128($np), $TEMP2
	add	%rax, $r3

___
($ACC0,$Y2)=($Y2,$ACC0);
$code.=<<___;
	vmovdqu	32*1-24-128($np), $ACC0
	vpmuludq	$Y1, $TEMP1, $TEMP1
	vmovdqu	32*3-16-128($np), $TEMP0
	vpaddq	$TEMP1, $ACC1, $ACC1
	vpmuludq	$Y2, $ACC0, $ACC0
	vpmuludq	$Y1, $TEMP2, $TEMP2
	.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff	# vmovdqu 32*4-16-128($np), $TEMP1
	vpaddq	$ACC1, $ACC0, $ACC0
	vpaddq	$TEMP2, $ACC2, $ACC2
	vpmuludq	$Y1, $TEMP0, $TEMP0
	vmovdqu	32*5-16-128($np), $TEMP2
	.byte	0x67
	vmovq	$ACC0, %rax
	vmovdqu	$ACC0, (%rsp)			# transfer $r0-$r3
	vpaddq	$TEMP0, $ACC3, $ACC3
	vpmuludq	$Y1, $TEMP1, $TEMP1
	vmovdqu	32*6-16-128($np), $TEMP0
	vpaddq	$TEMP1, $ACC4, $ACC4
	vpmuludq	$Y1, $TEMP2, $TEMP2
	vmovdqu	32*7-16-128($np), $TEMP1
	vpaddq	$TEMP2, $ACC5, $ACC5
	vpmuludq	$Y1, $TEMP0, $TEMP0
	vmovdqu	32*8-16-128($np), $TEMP2
	vpaddq	$TEMP0, $ACC6, $ACC6
	vpmuludq	$Y1, $TEMP1, $TEMP1
	shr	\$29, $r3
	vmovdqu	32*9-16-128($np), $TEMP0
	add	$r3, %rax
	vpaddq	$TEMP1, $ACC7, $ACC7
	vpmuludq	$Y1, $TEMP2, $TEMP2
	#vmovdqu	32*2-24-128($np), $TEMP1	# moved below
	mov	%rax, $r0
	imull	$n0, %eax
	vpaddq	$TEMP2, $ACC8, $ACC8
	vpmuludq	$Y1, $TEMP0, $TEMP0
	and	\$0x1fffffff, %eax
	vmovd	%eax, $Y1
	vmovdqu	32*3-24-128($np), $TEMP2
	.byte	0x67
	vpaddq	$TEMP0, $ACC9, $ACC9
	vpbroadcastq	$Y1, $Y1

	vpmuludq	32*2-24-128($np), $Y2, $TEMP1	# see above
	vmovdqu	32*4-24-128($np), $TEMP0
	mov	%rax, %rdx
	imulq	-128($np), %rax
	mov	8(%rsp), $r1
	vpaddq	$TEMP1, $ACC2, $ACC1
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vmovdqu	32*5-24-128($np), $TEMP1
	add	%rax, $r0
	mov	%rdx, %rax
	imulq	8-128($np), %rax
	.byte	0x67
	shr	\$29, $r0
	mov	16(%rsp), $r2
	vpaddq	$TEMP2, $ACC3, $ACC2
	vpmuludq	$Y2, $TEMP0, $TEMP0
	vmovdqu	32*6-24-128($np), $TEMP2
	add	%rax, $r1
	mov	%rdx, %rax
	imulq	16-128($np), %rax
	vpaddq	$TEMP0, $ACC4, $ACC3
	vpmuludq	$Y2, $TEMP1, $TEMP1
	vmovdqu	32*7-24-128($np), $TEMP0
	imulq	24-128($np), %rdx		# future $r3
	add	%rax, $r2
	lea	($r0,$r1), %rax
	vpaddq	$TEMP1, $ACC5, $ACC4
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vmovdqu	32*8-24-128($np), $TEMP1
	mov	%rax, $r1
	imull	$n0, %eax
	vpmuludq	$Y2, $TEMP0, $TEMP0
	vpaddq	$TEMP2, $ACC6, $ACC5
	vmovdqu	32*9-24-128($np), $TEMP2
	and	\$0x1fffffff, %eax
	vpaddq	$TEMP0, $ACC7, $ACC6
	vpmuludq	$Y2, $TEMP1, $TEMP1
	add	24(%rsp), %rdx
	vpaddq	$TEMP1, $ACC8, $ACC7
	vpmuludq	$Y2, $TEMP2, $TEMP2
	vpaddq	$TEMP2, $ACC9, $ACC8
	vmovq	$r3, $ACC9
	mov	%rdx, $r3

	dec	$i
	jnz	.LOOP_REDUCE_1024
___
($ACC0,$Y2)=($Y2,$ACC0);
$code.=<<___;
	lea	448(%rsp), $tp1			# size optimization
	vpaddq	$ACC9, $Y2, $ACC0
	vpxor	$ZERO, $ZERO, $ZERO

	vpaddq	32*9-192($tp0), $ACC0, $ACC0
	vpaddq	32*10-448($tp1), $ACC1, $ACC1
	vpaddq	32*11-448($tp1), $ACC2, $ACC2
	vpaddq	32*12-448($tp1), $ACC3, $ACC3
	vpaddq	32*13-448($tp1), $ACC4, $ACC4
	vpaddq	32*14-448($tp1), $ACC5, $ACC5
	vpaddq	32*15-448($tp1), $ACC6, $ACC6
	vpaddq	32*16-448($tp1), $ACC7, $ACC7
	vpaddq	32*17-448($tp1), $ACC8, $ACC8

	vpsrlq	\$29, $ACC0, $TEMP1
	vpand	$AND_MASK, $ACC0, $ACC0
	vpsrlq	\$29, $ACC1, $TEMP2
	vpand	$AND_MASK, $ACC1, $ACC1
	vpsrlq	\$29, $ACC2, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC2, $ACC2
	vpsrlq	\$29, $ACC3, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC3, $ACC3
	vpermq	\$0x93, $TEMP3, $TEMP3

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP4, $TEMP4
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC0, $ACC0
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC1, $ACC1
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC2, $ACC2
	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
	vpaddq	$TEMP3, $ACC3, $ACC3
	vpaddq	$TEMP4, $ACC4, $ACC4

	vpsrlq	\$29, $ACC0, $TEMP1
	vpand	$AND_MASK, $ACC0, $ACC0
	vpsrlq	\$29, $ACC1, $TEMP2
	vpand	$AND_MASK, $ACC1, $ACC1
	vpsrlq	\$29, $ACC2, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC2, $ACC2
	vpsrlq	\$29, $ACC3, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC3, $ACC3
	vpermq	\$0x93, $TEMP3, $TEMP3

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP4, $TEMP4
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC0, $ACC0
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC1, $ACC1
	vmovdqu	$ACC0, 32*0-128($rp)
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC2, $ACC2
	vmovdqu	$ACC1, 32*1-128($rp)
	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
	vpaddq	$TEMP3, $ACC3, $ACC3
	vmovdqu	$ACC2, 32*2-128($rp)
	vpaddq	$TEMP4, $ACC4, $ACC4
	vmovdqu	$ACC3, 32*3-128($rp)
___
$TEMP5=$ACC0;
$code.=<<___;
	vpsrlq	\$29, $ACC4, $TEMP1
	vpand	$AND_MASK, $ACC4, $ACC4
	vpsrlq	\$29, $ACC5, $TEMP2
	vpand	$AND_MASK, $ACC5, $ACC5
	vpsrlq	\$29, $ACC6, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC6, $ACC6
	vpsrlq	\$29, $ACC7, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC7, $ACC7
	vpsrlq	\$29, $ACC8, $TEMP5
	vpermq	\$0x93, $TEMP3, $TEMP3
	vpand	$AND_MASK, $ACC8, $ACC8
	vpermq	\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC5, $ACC5
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC6, $ACC6
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq	$TEMP3, $ACC7, $ACC7
	vpaddq	$TEMP4, $ACC8, $ACC8

	vpsrlq	\$29, $ACC4, $TEMP1
	vpand	$AND_MASK, $ACC4, $ACC4
	vpsrlq	\$29, $ACC5, $TEMP2
	vpand	$AND_MASK, $ACC5, $ACC5
	vpsrlq	\$29, $ACC6, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC6, $ACC6
	vpsrlq	\$29, $ACC7, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC7, $ACC7
	vpsrlq	\$29, $ACC8, $TEMP5
	vpermq	\$0x93, $TEMP3, $TEMP3
	vpand	$AND_MASK, $ACC8, $ACC8
	vpermq	\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC5, $ACC5
	vmovdqu	$ACC4, 32*4-128($rp)
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC6, $ACC6
	vmovdqu	$ACC5, 32*5-128($rp)
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq	$TEMP3, $ACC7, $ACC7
	vmovdqu	$ACC6, 32*6-128($rp)
	vpaddq	$TEMP4, $ACC8, $ACC8
	vmovdqu	$ACC7, 32*7-128($rp)
	vmovdqu	$ACC8, 32*8-128($rp)

	mov	$rp, $ap
	dec	$rep
	jne	.LOOP_GRANDE_SQR_1024

	vzeroall
	mov	%rbp, %rax
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lsqr_1024_epilogue:
	ret
.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
___
}

{ # void AMM_WW(
my $rp="%rdi";	# BN_ULONG *rp,
my $ap="%rsi";	# const BN_ULONG *ap,
my $bp="%rdx";	# const BN_ULONG *bp,
my $np="%rcx";	# const BN_ULONG *np,
my $n0="%r8d";	# unsigned int n0);

# The registers that hold the accumulated redundant result
# The AMM works on 1024 bit operands, and redundant word size is 29
# Therefore: ceil(1024/29)/4 = 9
my $ACC0="%ymm0";
my $ACC1="%ymm1";
my $ACC2="%ymm2";
my $ACC3="%ymm3";
my $ACC4="%ymm4";
my $ACC5="%ymm5";
my $ACC6="%ymm6";
my $ACC7="%ymm7";
my $ACC8="%ymm8";
my $ACC9="%ymm9";

# Registers that hold the broadcasted words of multiplier, currently used
my $Bi="%ymm10";
my $Yi="%ymm11";

# Helper registers
my $TEMP0=$ACC0;
my $TEMP1="%ymm12";
my $TEMP2="%ymm13";
my $ZERO="%ymm14";
my $AND_MASK="%ymm15";

# alu registers that hold the first words of the ACC
my $r0="%r9";
my $r1="%r10";
my $r2="%r11";
my $r3="%r12";

my $i="%r14d";
my $tmp="%r15";

$bp="%r13";	# reassigned argument

$code.=<<___;
.globl	rsaz_1024_mul_avx2
.type	rsaz_1024_mul_avx2,\@function,5
.align	64
rsaz_1024_mul_avx2:
	lea	(%rsp), %rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	vzeroupper
	lea	-0xa8(%rsp),%rsp
	vmovaps	%xmm6,-0xd8(%rax)
	vmovaps	%xmm7,-0xc8(%rax)
	vmovaps	%xmm8,-0xb8(%rax)
	vmovaps	%xmm9,-0xa8(%rax)
	vmovaps	%xmm10,-0x98(%rax)
	vmovaps	%xmm11,-0x88(%rax)
	vmovaps	%xmm12,-0x78(%rax)
	vmovaps	%xmm13,-0x68(%rax)
	vmovaps	%xmm14,-0x58(%rax)
	vmovaps	%xmm15,-0x48(%rax)
.Lmul_1024_body:
___
$code.=<<___;
	mov	%rax,%rbp
	vzeroall
	mov	%rdx, $bp	# reassigned argument
	sub	\$64,%rsp

	# unaligned 256-bit load that crosses page boundary can
	# cause severe performance degradation here, so if $ap does
	# cross page boundary, swap it with $bp [meaning that caller
	# is advised to lay down $ap and $bp next to each other, so
	# that only one can cross page boundary].
	.byte	0x67,0x67
	mov	$ap, $tmp
	and	\$4095, $tmp
	add	\$32*10, $tmp
	shr	\$12, $tmp
	mov	$ap, $tmp
	cmovnz	$bp, $ap
	cmovnz	$tmp, $bp

	mov	$np, $tmp
	sub	\$-128,$ap	# size optimization
	sub	\$-128,$np
	sub	\$-128,$rp

	and	\$4095, $tmp	# see if $np crosses page
	add	\$32*10, $tmp
	.byte	0x67,0x67
	shr	\$12, $tmp
	jz	.Lmul_1024_no_n_copy

	# unaligned 256-bit load that crosses page boundary can
	# cause severe performance degradation here, so if $np does
	# cross page boundary, copy it to stack and make sure stack
	# frame doesn't...
	sub	\$32*10,%rsp
	vmovdqu	32*0-128($np), $ACC0
	and	\$-512, %rsp
	vmovdqu	32*1-128($np), $ACC1
	vmovdqu	32*2-128($np), $ACC2
	vmovdqu	32*3-128($np), $ACC3
	vmovdqu	32*4-128($np), $ACC4
	vmovdqu	32*5-128($np), $ACC5
	vmovdqu	32*6-128($np), $ACC6
	vmovdqu	32*7-128($np), $ACC7
	vmovdqu	32*8-128($np), $ACC8
	lea	64+128(%rsp),$np
	vmovdqu	$ACC0, 32*0-128($np)
	vpxor	$ACC0, $ACC0, $ACC0
	vmovdqu	$ACC1, 32*1-128($np)
	vpxor	$ACC1, $ACC1, $ACC1
	vmovdqu	$ACC2, 32*2-128($np)
	vpxor	$ACC2, $ACC2, $ACC2
	vmovdqu	$ACC3, 32*3-128($np)
	vpxor	$ACC3, $ACC3, $ACC3
	vmovdqu	$ACC4, 32*4-128($np)
	vpxor	$ACC4, $ACC4, $ACC4
	vmovdqu	$ACC5, 32*5-128($np)
	vpxor	$ACC5, $ACC5, $ACC5
	vmovdqu	$ACC6, 32*6-128($np)
	vpxor	$ACC6, $ACC6, $ACC6
	vmovdqu	$ACC7, 32*7-128($np)
	vpxor	$ACC7, $ACC7, $ACC7
	vmovdqu	$ACC8, 32*8-128($np)
	vmovdqa	$ACC0, $ACC8
	vmovdqu	$ACC9, 32*9-128($np)	# $ACC9 is zero after vzeroall
.Lmul_1024_no_n_copy:
	and	\$-64,%rsp

	mov	($bp), %rbx
	vpbroadcastq	($bp), $Bi
	vmovdqu	$ACC0, (%rsp)		# clear top of stack
	xor	$r0, $r0
	.byte	0x67
	xor	$r1, $r1
	xor	$r2, $r2
	xor	$r3, $r3

	vmovdqu	.Land_mask(%rip), $AND_MASK
	mov	\$9, $i
	vmovdqu	$ACC9, 32*9-128($rp)	# $ACC9 is zero after vzeroall
	jmp	.Loop_mul_1024

.align	32
.Loop_mul_1024:
	vpsrlq	\$29, $ACC3, $ACC9		# correct $ACC3(*)
	mov	%rbx, %rax
	imulq	-128($ap), %rax
	add	$r0, %rax
	mov	%rbx, $r1
	imulq	8-128($ap), $r1
	add	8(%rsp), $r1

	mov	%rax, $r0
	imull	$n0, %eax
	and	\$0x1fffffff, %eax

	mov	%rbx, $r2
	imulq	16-128($ap), $r2
	add	16(%rsp), $r2

	mov	%rbx, $r3
	imulq	24-128($ap), $r3
	add	24(%rsp), $r3
	vpmuludq	32*1-128($ap),$Bi,$TEMP0
	vmovd	%eax, $Yi
	vpaddq	$TEMP0,$ACC1,$ACC1
	vpmuludq	32*2-128($ap),$Bi,$TEMP1
	vpbroadcastq	$Yi, $Yi
	vpaddq	$TEMP1,$ACC2,$ACC2
	vpmuludq	32*3-128($ap),$Bi,$TEMP2
	vpand	$AND_MASK, $ACC3, $ACC3		# correct $ACC3
	vpaddq	$TEMP2,$ACC3,$ACC3
	vpmuludq	32*4-128($ap),$Bi,$TEMP0
	vpaddq	$TEMP0,$ACC4,$ACC4
	vpmuludq	32*5-128($ap),$Bi,$TEMP1
	vpaddq	$TEMP1,$ACC5,$ACC5
	vpmuludq	32*6-128($ap),$Bi,$TEMP2
	vpaddq	$TEMP2,$ACC6,$ACC6
	vpmuludq	32*7-128($ap),$Bi,$TEMP0
	vpermq	\$0x93, $ACC9, $ACC9		# correct $ACC3
	vpaddq	$TEMP0,$ACC7,$ACC7
	vpmuludq	32*8-128($ap),$Bi,$TEMP1
	vpbroadcastq	8($bp), $Bi
	vpaddq	$TEMP1,$ACC8,$ACC8

	mov	%rax,%rdx
	imulq	-128($np),%rax
	add	%rax,$r0
	mov	%rdx,%rax
	imulq	8-128($np),%rax
	add	%rax,$r1
	mov	%rdx,%rax
	imulq	16-128($np),%rax
	add	%rax,$r2
	shr	\$29, $r0
	imulq	24-128($np),%rdx
	add	%rdx,$r3
	add	$r0, $r1

	vpmuludq	32*1-128($np),$Yi,$TEMP2
	vmovq	$Bi, %rbx
	vpaddq	$TEMP2,$ACC1,$ACC1
	vpmuludq	32*2-128($np),$Yi,$TEMP0
	vpaddq	$TEMP0,$ACC2,$ACC2
	vpmuludq	32*3-128($np),$Yi,$TEMP1
	vpaddq	$TEMP1,$ACC3,$ACC3
	vpmuludq	32*4-128($np),$Yi,$TEMP2
	vpaddq	$TEMP2,$ACC4,$ACC4
	vpmuludq	32*5-128($np),$Yi,$TEMP0
	vpaddq	$TEMP0,$ACC5,$ACC5
	vpmuludq	32*6-128($np),$Yi,$TEMP1
	vpaddq	$TEMP1,$ACC6,$ACC6
	vpmuludq	32*7-128($np),$Yi,$TEMP2
	vpblendd	\$3, $ZERO, $ACC9, $ACC9	# correct $ACC3
	vpaddq	$TEMP2,$ACC7,$ACC7
	vpmuludq	32*8-128($np),$Yi,$TEMP0
	vpaddq	$ACC9, $ACC3, $ACC3		# correct $ACC3
	vpaddq	$TEMP0,$ACC8,$ACC8

	mov	%rbx, %rax
	imulq	-128($ap),%rax
	add	%rax,$r1
	vmovdqu	-8+32*1-128($ap),$TEMP1
	mov	%rbx, %rax
	imulq	8-128($ap),%rax
	add	%rax,$r2
	vmovdqu	-8+32*2-128($ap),$TEMP2

	mov	$r1, %rax
	imull	$n0, %eax
	and	\$0x1fffffff, %eax

	imulq	16-128($ap),%rbx
	add	%rbx,$r3
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vmovd	%eax, $Yi
	vmovdqu	-8+32*3-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC1,$ACC1
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vpbroadcastq	$Yi, $Yi
	vmovdqu	-8+32*4-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC2,$ACC2
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovdqu	-8+32*5-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC3,$ACC3
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vmovdqu	-8+32*6-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC4,$ACC4
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vmovdqu	-8+32*7-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC5,$ACC5
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovdqu	-8+32*8-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC6,$ACC6
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vmovdqu	-8+32*9-128($ap),$ACC9
	vpaddq	$TEMP1,$ACC7,$ACC7
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vpaddq	$TEMP2,$ACC8,$ACC8
	vpmuludq	$Bi,$ACC9,$ACC9
	vpbroadcastq	16($bp), $Bi

	mov	%rax,%rdx
	imulq	-128($np),%rax
	add	%rax,$r1
	vmovdqu	-8+32*1-128($np),$TEMP0
	mov	%rdx,%rax
	imulq	8-128($np),%rax
	add	%rax,$r2
	vmovdqu	-8+32*2-128($np),$TEMP1
	shr	\$29, $r1
	imulq	16-128($np),%rdx
	add	%rdx,$r3
	add	$r1, $r2

	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovq	$Bi, %rbx
	vmovdqu	-8+32*3-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC1,$ACC1
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	-8+32*4-128($np),$TEMP0
	vpaddq	$TEMP1,$ACC2,$ACC2
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-8+32*5-128($np),$TEMP1
	vpaddq	$TEMP2,$ACC3,$ACC3
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovdqu	-8+32*6-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC4,$ACC4
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	-8+32*7-128($np),$TEMP0
	vpaddq	$TEMP1,$ACC5,$ACC5
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-8+32*8-128($np),$TEMP1
	vpaddq	$TEMP2,$ACC6,$ACC6
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovdqu	-8+32*9-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC7,$ACC7
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vpaddq	$TEMP1,$ACC8,$ACC8
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vpaddq	$TEMP2,$ACC9,$ACC9

	vmovdqu	-16+32*1-128($ap),$TEMP0
	mov	%rbx,%rax
	imulq	-128($ap),%rax
	add	$r2,%rax

	vmovdqu	-16+32*2-128($ap),$TEMP1
	mov	%rax,$r2
	imull	$n0, %eax
	and	\$0x1fffffff, %eax

	imulq	8-128($ap),%rbx
	add	%rbx,$r3
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovd	%eax, $Yi
	vmovdqu	-16+32*3-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC1,$ACC1
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vpbroadcastq	$Yi, $Yi
	vmovdqu	-16+32*4-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC2,$ACC2
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vmovdqu	-16+32*5-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC3,$ACC3
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovdqu	-16+32*6-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC4,$ACC4
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vmovdqu	-16+32*7-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC5,$ACC5
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vmovdqu	-16+32*8-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC6,$ACC6
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovdqu	-16+32*9-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC7,$ACC7
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vpaddq	$TEMP1,$ACC8,$ACC8
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vpbroadcastq	24($bp), $Bi
	vpaddq	$TEMP2,$ACC9,$ACC9

	vmovdqu	-16+32*1-128($np),$TEMP0
	mov	%rax,%rdx
	imulq	-128($np),%rax
	add	%rax,$r2
	vmovdqu	-16+32*2-128($np),$TEMP1
	imulq	8-128($np),%rdx
	add	%rdx,$r3
	shr	\$29, $r2

	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovq	$Bi, %rbx
	vmovdqu	-16+32*3-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC1,$ACC1
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	-16+32*4-128($np),$TEMP0
	vpaddq	$TEMP1,$ACC2,$ACC2
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-16+32*5-128($np),$TEMP1
	vpaddq	$TEMP2,$ACC3,$ACC3
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovdqu	-16+32*6-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC4,$ACC4
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	-16+32*7-128($np),$TEMP0
	vpaddq	$TEMP1,$ACC5,$ACC5
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-16+32*8-128($np),$TEMP1
	vpaddq	$TEMP2,$ACC6,$ACC6
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovdqu	-16+32*9-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC7,$ACC7
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	-24+32*1-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC8,$ACC8
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-24+32*2-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC9,$ACC9

	add	$r2, $r3
	imulq	-128($ap),%rbx
	add	%rbx,$r3

	mov	$r3, %rax
	imull	$n0, %eax
	and	\$0x1fffffff, %eax

	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovd	%eax, $Yi
	vmovdqu	-24+32*3-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC1,$ACC1
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vpbroadcastq	$Yi, $Yi
	vmovdqu	-24+32*4-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC2,$ACC2
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vmovdqu	-24+32*5-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC3,$ACC3
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovdqu	-24+32*6-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC4,$ACC4
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vmovdqu	-24+32*7-128($ap),$TEMP0
	vpaddq	$TEMP1,$ACC5,$ACC5
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vmovdqu	-24+32*8-128($ap),$TEMP1
	vpaddq	$TEMP2,$ACC6,$ACC6
	vpmuludq	$Bi,$TEMP0,$TEMP0
	vmovdqu	-24+32*9-128($ap),$TEMP2
	vpaddq	$TEMP0,$ACC7,$ACC7
	vpmuludq	$Bi,$TEMP1,$TEMP1
	vpaddq	$TEMP1,$ACC8,$ACC8
	vpmuludq	$Bi,$TEMP2,$TEMP2
	vpbroadcastq	32($bp), $Bi
	vpaddq	$TEMP2,$ACC9,$ACC9
	add	\$32, $bp			# $bp++

	vmovdqu	-24+32*1-128($np),$TEMP0
	imulq	-128($np),%rax
	add	%rax,$r3
	shr	\$29, $r3

	vmovdqu	-24+32*2-128($np),$TEMP1
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovq	$Bi, %rbx
	vmovdqu	-24+32*3-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC1,$ACC0		# $ACC0==$TEMP0
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	$ACC0, (%rsp)			# transfer $r0-$r3
	vpaddq	$TEMP1,$ACC2,$ACC1
	vmovdqu	-24+32*4-128($np),$TEMP0
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-24+32*5-128($np),$TEMP1
	vpaddq	$TEMP2,$ACC3,$ACC2
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovdqu	-24+32*6-128($np),$TEMP2
	vpaddq	$TEMP0,$ACC4,$ACC3
	vpmuludq	$Yi,$TEMP1,$TEMP1
	vmovdqu	-24+32*7-128($np),$TEMP0
	vpaddq	$TEMP1,$ACC5,$ACC4
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovdqu	-24+32*8-128($np),$TEMP1
	vpaddq	$TEMP2,$ACC6,$ACC5
	vpmuludq	$Yi,$TEMP0,$TEMP0
	vmovdqu	-24+32*9-128($np),$TEMP2
	mov	$r3, $r0
	vpaddq	$TEMP0,$ACC7,$ACC6
	vpmuludq	$Yi,$TEMP1,$TEMP1
	add	(%rsp), $r0
	vpaddq	$TEMP1,$ACC8,$ACC7
	vpmuludq	$Yi,$TEMP2,$TEMP2
	vmovq	$r3, $TEMP1
	vpaddq	$TEMP2,$ACC9,$ACC8

	dec	$i
	jnz	.Loop_mul_1024
___

# (*)	Original implementation was correcting ACC1-ACC3 for overflow
#	after 7 loop runs, or after 28 iterations, or 56 additions.
#	But as we underutilize resources, it's possible to correct in
#	each iteration with marginal performance loss. But then, as
#	we do it in each iteration, we can correct fewer digits, and
#	avoid performance penalties completely. Also note that we
#	correct only three digits out of four. This works because
#	the most significant digit is subjected to fewer additions.
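# As a plain-Perl illustration of the digit correction described above (this
# sketch is illustrative only and is not used by the script; the helper name
# and the array-based interface are assumptions): every digit lives in a
# 64-bit lane but is nominally 29 bits wide, so normalizing means masking
# each digit to 29 bits and folding the shifted-out excess into the next one.
sub _illustrate_digit_correction {
	my @d = @_;				# digits, least significant first
	for (my $j=0; $j<$#d; $j++) {
		my $carry = $d[$j] >> 29;	# excess above the 29-bit digit
		$d[$j] &= 0x1fffffff;		# keep the low 29 bits
		$d[$j+1] += $carry;		# fold the excess into the next digit
	}
	return @d;
}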

$TEMP0 = $ACC9;
$TEMP3 = $Bi;
$TEMP4 = $Yi;
$code.=<<___;
	vpermq	\$0, $AND_MASK, $AND_MASK
	vpaddq	(%rsp), $TEMP1, $ACC0

	vpsrlq	\$29, $ACC0, $TEMP1
	vpand	$AND_MASK, $ACC0, $ACC0
	vpsrlq	\$29, $ACC1, $TEMP2
	vpand	$AND_MASK, $ACC1, $ACC1
	vpsrlq	\$29, $ACC2, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC2, $ACC2
	vpsrlq	\$29, $ACC3, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC3, $ACC3

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP3, $TEMP3
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpermq	\$0x93, $TEMP4, $TEMP4
	vpaddq	$TEMP0, $ACC0, $ACC0
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC1, $ACC1
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC2, $ACC2
	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
	vpaddq	$TEMP3, $ACC3, $ACC3
	vpaddq	$TEMP4, $ACC4, $ACC4

	vpsrlq	\$29, $ACC0, $TEMP1
	vpand	$AND_MASK, $ACC0, $ACC0
	vpsrlq	\$29, $ACC1, $TEMP2
	vpand	$AND_MASK, $ACC1, $ACC1
	vpsrlq	\$29, $ACC2, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC2, $ACC2
	vpsrlq	\$29, $ACC3, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC3, $ACC3
	vpermq	\$0x93, $TEMP3, $TEMP3

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP4, $TEMP4
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC0, $ACC0
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC1, $ACC1
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC2, $ACC2
	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
	vpaddq	$TEMP3, $ACC3, $ACC3
	vpaddq	$TEMP4, $ACC4, $ACC4

	vmovdqu	$ACC0, 0-128($rp)
	vmovdqu	$ACC1, 32-128($rp)
	vmovdqu	$ACC2, 64-128($rp)
	vmovdqu	$ACC3, 96-128($rp)
___

$TEMP5=$ACC0;
$code.=<<___;
	vpsrlq	\$29, $ACC4, $TEMP1
	vpand	$AND_MASK, $ACC4, $ACC4
	vpsrlq	\$29, $ACC5, $TEMP2
	vpand	$AND_MASK, $ACC5, $ACC5
	vpsrlq	\$29, $ACC6, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC6, $ACC6
	vpsrlq	\$29, $ACC7, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC7, $ACC7
	vpsrlq	\$29, $ACC8, $TEMP5
	vpermq	\$0x93, $TEMP3, $TEMP3
	vpand	$AND_MASK, $ACC8, $ACC8
	vpermq	\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC5, $ACC5
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC6, $ACC6
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq	$TEMP3, $ACC7, $ACC7
	vpaddq	$TEMP4, $ACC8, $ACC8

	vpsrlq	\$29, $ACC4, $TEMP1
	vpand	$AND_MASK, $ACC4, $ACC4
	vpsrlq	\$29, $ACC5, $TEMP2
	vpand	$AND_MASK, $ACC5, $ACC5
	vpsrlq	\$29, $ACC6, $TEMP3
	vpermq	\$0x93, $TEMP1, $TEMP1
	vpand	$AND_MASK, $ACC6, $ACC6
	vpsrlq	\$29, $ACC7, $TEMP4
	vpermq	\$0x93, $TEMP2, $TEMP2
	vpand	$AND_MASK, $ACC7, $ACC7
	vpsrlq	\$29, $ACC8, $TEMP5
	vpermq	\$0x93, $TEMP3, $TEMP3
	vpand	$AND_MASK, $ACC8, $ACC8
	vpermq	\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC5, $ACC5
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC6, $ACC6
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq	$TEMP3, $ACC7, $ACC7
	vpaddq	$TEMP4, $ACC8, $ACC8

	vmovdqu	$ACC4, 128-128($rp)
	vmovdqu	$ACC5, 160-128($rp)
	vmovdqu	$ACC6, 192-128($rp)
	vmovdqu	$ACC7, 224-128($rp)
	vmovdqu	$ACC8, 256-128($rp)
	vzeroupper

	mov	%rbp, %rax
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lmul_1024_epilogue:
	ret
.size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
___
}
{
my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
my @T = map("%r$_",(8..11));

$code.=<<___;
.globl	rsaz_1024_red2norm_avx2
.type	rsaz_1024_red2norm_avx2,\@abi-omnipotent
.align	32
rsaz_1024_red2norm_avx2:
	sub	\$-128,$inp	# size optimization
	xor	%rax,%rax
___

for ($j=0,$i=0; $i<16; $i++) {
    my $k=0;
    while (29*$j<64*($i+1)) {	# load data till boundary
	$code.="	mov	`8*$j-128`($inp), @T[0]\n";
	$j++; $k++; push(@T,shift(@T));
    }
    $l=$k;
    while ($k>1) {		# shift loaded data but last value
	$code.="	shl	\$`29*($j-$k)`,@T[-$k]\n";
	$k--;
    }
    $code.=<<___;		# shift last value
	mov	@T[-1], @T[0]
	shl	\$`29*($j-1)`, @T[-1]
	shr	\$`-29*($j-1)`, @T[0]
___
    while ($l) {		# accumulate all values
	$code.="	add	@T[-$l], %rax\n";
	$l--;
    }
    $code.=<<___;
	adc	\$0, @T[0]	# consume eventual carry
	mov	%rax, 8*$i($out)
	mov	@T[0], %rax
___
    push(@T,shift(@T));
}
$code.=<<___;
	ret
.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2

.globl	rsaz_1024_norm2red_avx2
.type	rsaz_1024_norm2red_avx2,\@abi-omnipotent
.align	32
rsaz_1024_norm2red_avx2:
	sub	\$-128,$out	# size optimization
	mov	($inp),@T[0]
	mov	\$0x1fffffff,%eax
___
for ($j=0,$i=0; $i<16; $i++) {
    $code.="	mov	`8*($i+1)`($inp),@T[1]\n"	if ($i<15);
    $code.="	xor	@T[1],@T[1]\n"			if ($i==15);
    my $k=1;
    while (29*($j+1)<64*($i+1)) {
	$code.=<<___;
	mov	@T[0],@T[-$k]
	shr	\$`29*$j`,@T[-$k]
	and	%rax,@T[-$k]	# &0x1fffffff
	mov	@T[-$k],`8*$j-128`($out)
___
	$j++; $k++;
    }
    $code.=<<___;
	shrd	\$`29*$j`,@T[1],@T[0]
	and	%rax,@T[0]
	mov	@T[0],`8*$j-128`($out)
___
    $j++;
    push(@T,shift(@T));
}
$code.=<<___;
	mov	@T[0],`8*$j-128`($out)		# zero
	mov	@T[0],`8*($j+1)-128`($out)
	mov	@T[0],`8*($j+2)-128`($out)
	mov	@T[0],`8*($j+3)-128`($out)
	ret
.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
___
}
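# The two generators above convert between the canonical radix-2^64 form
# (sixteen 64-bit limbs) and the redundant radix-2^29 form (36 digits, one
# digit per 64-bit word) used by the AVX2 code. The sketch below illustrates
# that conversion in plain Perl and is not used anywhere in this script; the
# helper name and array interface are assumptions, and it needs a 64-bit perl.
sub _illustrate_norm2red {
	my @limbs = @_;			# sixteen 64-bit limbs, least significant first
	my @digits;
	for (my $bit=0; $bit<1024; $bit+=29) {
		my ($word,$off) = (int($bit/64), $bit%64);
		my $lo = $limbs[$word] >> $off;
		my $hi = ($off && $word<15) ? $limbs[$word+1] << (64-$off) : 0;
		push @digits, ($lo | $hi) & 0x1fffffff;	# keep 29 bits per digit
	}
	return @digits;			# 36 digits of 29 bits each
}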
{
my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");

$code.=<<___;
.globl	rsaz_1024_scatter5_avx2
.type	rsaz_1024_scatter5_avx2,\@abi-omnipotent
.align	32
rsaz_1024_scatter5_avx2:
	vzeroupper
	vmovdqu	.Lscatter_permd(%rip),%ymm5
	shl	\$4,$power
	lea	($out,$power),$out
	mov	\$9,%eax
	jmp	.Loop_scatter_1024

.align	32
.Loop_scatter_1024:
	vmovdqu	($inp),%ymm0
	lea	32($inp),$inp
	vpermd	%ymm0,%ymm5,%ymm0
	vmovdqu	%xmm0,($out)
	lea	16*32($out),$out
	dec	%eax
	jnz	.Loop_scatter_1024

	vzeroupper
	ret
.size	rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2

.globl	rsaz_1024_gather5_avx2
.type	rsaz_1024_gather5_avx2,\@abi-omnipotent
.align	32
rsaz_1024_gather5_avx2:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
	vzeroupper
.LSEH_begin_rsaz_1024_gather5:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax),%rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6,-0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7,-0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8,0(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9,0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10,0x20(%rax)
	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11,0x30(%rax)
	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12,0x40(%rax)
	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13,0x50(%rax)
	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14,0x60(%rax)
	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15,0x70(%rax)
___
$code.=<<___;
	lea	.Lgather_table(%rip),%r11
	mov	$power,%eax
	and	\$3,$power
	shr	\$2,%eax			# cache line number
	shl	\$4,$power			# offset within cache line

	vmovdqu	-32(%r11),%ymm7			# .Lgather_permd
	vpbroadcastb	8(%r11,%rax), %xmm8
	vpbroadcastb	7(%r11,%rax), %xmm9
	vpbroadcastb	6(%r11,%rax), %xmm10
	vpbroadcastb	5(%r11,%rax), %xmm11
	vpbroadcastb	4(%r11,%rax), %xmm12
	vpbroadcastb	3(%r11,%rax), %xmm13
	vpbroadcastb	2(%r11,%rax), %xmm14
	vpbroadcastb	1(%r11,%rax), %xmm15

	lea	64($inp,$power),$inp
	mov	\$64,%r11			# size optimization
	mov	\$9,%eax
	jmp	.Loop_gather_1024

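	# Each iteration below reads all eight candidate cache lines of the
	# current 512-byte group and keeps only the required one via the byte
	# masks prepared above, so the set of cache lines touched does not
	# depend on the table index being gathered.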
.align	32
.Loop_gather_1024:
	vpand	-64($inp), %xmm8,%xmm0
	vpand	($inp), %xmm9,%xmm1
	vpand	64($inp), %xmm10,%xmm2
	vpand	($inp,%r11,2), %xmm11,%xmm3
	vpor	%xmm0,%xmm1,%xmm1
	vpand	64($inp,%r11,2), %xmm12,%xmm4
	vpor	%xmm2,%xmm3,%xmm3
	vpand	($inp,%r11,4), %xmm13,%xmm5
	vpor	%xmm1,%xmm3,%xmm3
	vpand	64($inp,%r11,4), %xmm14,%xmm6
	vpor	%xmm4,%xmm5,%xmm5
	vpand	-128($inp,%r11,8), %xmm15,%xmm2
	lea	($inp,%r11,8),$inp
	vpor	%xmm3,%xmm5,%xmm5
	vpor	%xmm2,%xmm6,%xmm6
	vpor	%xmm5,%xmm6,%xmm6
	vpermd	%ymm6,%ymm7,%ymm6
	vmovdqu	%ymm6,($out)
	lea	32($out),$out
	dec	%eax
	jnz	.Loop_gather_1024

	vpxor	%ymm0,%ymm0,%ymm0
	vmovdqu	%ymm0,($out)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_rsaz_1024_gather5:
___
$code.=<<___;
	ret
.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
___
}

$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	rsaz_avx2_eligible
.type	rsaz_avx2_eligible,\@abi-omnipotent
.align	32
rsaz_avx2_eligible:
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___ if ($addx);
	mov	\$`1<<8|1<<19`,%ecx
	mov	\$0,%edx
	and	%eax,%ecx
	cmp	\$`1<<8|1<<19`,%ecx	# check for BMI2+AD*X
	cmove	%edx,%eax
___
$code.=<<___;
	and	\$`1<<5`,%eax
	shr	\$5,%eax
	ret
.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible

.align	64
.Land_mask:
	.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
.Lscatter_permd:
	.long	0,2,4,6,7,7,7,7
.Lgather_permd:
	.long	0,7,1,7,2,7,3,7
.Lgather_table:
	.byte	0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
.align	64
___

if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___
.extern	__imp_RtlVirtualUnwind
.type	rsaz_se_handler,\@abi-omnipotent
.align	16
rsaz_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	160($context),%rax	# pull context->Rbp

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	rsaz_se_handler,.-rsaz_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_1024_sqr_avx2
	.rva	.LSEH_end_rsaz_1024_sqr_avx2
	.rva	.LSEH_info_rsaz_1024_sqr_avx2

	.rva	.LSEH_begin_rsaz_1024_mul_avx2
	.rva	.LSEH_end_rsaz_1024_mul_avx2
	.rva	.LSEH_info_rsaz_1024_mul_avx2

	.rva	.LSEH_begin_rsaz_1024_gather5
	.rva	.LSEH_end_rsaz_1024_gather5
	.rva	.LSEH_info_rsaz_1024_gather5
.section	.xdata
.align	8
.LSEH_info_rsaz_1024_sqr_avx2:
	.byte	9,0,0,0
	.rva	rsaz_se_handler
	.rva	.Lsqr_1024_body,.Lsqr_1024_epilogue
.LSEH_info_rsaz_1024_mul_avx2:
	.byte	9,0,0,0
	.rva	rsaz_se_handler
	.rva	.Lmul_1024_body,.Lmul_1024_epilogue
.LSEH_info_rsaz_1024_gather5:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	# sub rsp,0xa8
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
	print $_,"\n";
}

}}} else {{{
print <<___;	# assembler is too old
.text

.globl	rsaz_avx2_eligible
.type	rsaz_avx2_eligible,\@abi-omnipotent
rsaz_avx2_eligible:
	xor	%eax,%eax
	ret
.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible

.globl	rsaz_1024_sqr_avx2
.globl	rsaz_1024_mul_avx2
.globl	rsaz_1024_norm2red_avx2
.globl	rsaz_1024_red2norm_avx2
.globl	rsaz_1024_scatter5_avx2
.globl	rsaz_1024_gather5_avx2
.type	rsaz_1024_sqr_avx2,\@abi-omnipotent
rsaz_1024_sqr_avx2:
rsaz_1024_mul_avx2:
rsaz_1024_norm2red_avx2:
rsaz_1024_red2norm_avx2:
rsaz_1024_scatter5_avx2:
rsaz_1024_gather5_avx2:
	.byte	0x0f,0x0b	# ud2
	ret
.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
___
}}}

close STDOUT;
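# Typical invocation (illustrative only; the exact command line comes from the
# OpenSSL build system): the perlasm "flavour" and the output file are passed
# on the command line and the generated code is piped through x86_64-xlate.pl
# above, e.g.
#
#	perl rsaz-avx2.pl elf  rsaz-avx2.s
#	perl rsaz-avx2.pl nasm rsaz-avx2.asm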