1#!/usr/bin/env perl 2 3# Copyright (c) 2014, Intel Corporation. 4# 5# Permission to use, copy, modify, and/or distribute this software for any 6# purpose with or without fee is hereby granted, provided that the above 7# copyright notice and this permission notice appear in all copies. 8# 9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 12# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 14# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 15# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 17# Developers and authors: 18# Shay Gueron (1, 2), and Vlad Krasnov (1) 19# (1) Intel Corporation, Israel Development Center 20# (2) University of Haifa 21 22# Reference: 23# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with 24# 256 Bit Primes" 25 26# Further optimization by <appro@openssl.org>: 27# 28# this/original 29# Opteron +12-49% 30# Bulldozer +14-45% 31# P4 +18-46% 32# Westmere +12-34% 33# Sandy Bridge +9-35% 34# Ivy Bridge +9-35% 35# Haswell +8-37% 36# Broadwell +18-58% 37# Atom +15-50% 38# VIA Nano +43-160% 39# 40# Ranges denote minimum and maximum improvement coefficients depending 41# on benchmark. 42 43$flavour = shift; 44$output = shift; 45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 46 47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 48 49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 51( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 52die "can't locate x86_64-xlate.pl"; 53 54open OUT,"| \"$^X\" $xlate $flavour $output"; 55*STDOUT=*OUT; 56 57# TODO: enable these after testing. $avx goes to two and $addx to one. 58$avx=0; 59$addx=0; 60 61$code.=<<___; 62.text 63.extern OPENSSL_ia32cap_P 64 65# The polynomial 66.align 64 67.Lpoly: 68.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 69 70.LOne: 71.long 1,1,1,1,1,1,1,1 72.LTwo: 73.long 2,2,2,2,2,2,2,2 74.LThree: 75.long 3,3,3,3,3,3,3,3 76.LONE_mont: 77.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe 78___ 79 80{ 81################################################################################ 82# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]); 83 84my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); 85my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); 86my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); 87 88$code.=<<___; 89 90.type ecp_nistz256_mul_by_2,\@function,2 91.align 64 92ecp_nistz256_mul_by_2: 93 push %r12 94 push %r13 95 96 mov 8*0($a_ptr), $a0 97 mov 8*1($a_ptr), $a1 98 add $a0, $a0 # a0:a3+a0:a3 99 mov 8*2($a_ptr), $a2 100 adc $a1, $a1 101 mov 8*3($a_ptr), $a3 102 lea .Lpoly(%rip), $a_ptr 103 mov $a0, $t0 104 adc $a2, $a2 105 adc $a3, $a3 106 mov $a1, $t1 107 sbb $t4, $t4 108 109 sub 8*0($a_ptr), $a0 110 mov $a2, $t2 111 sbb 8*1($a_ptr), $a1 112 sbb 8*2($a_ptr), $a2 113 mov $a3, $t3 114 sbb 8*3($a_ptr), $a3 115 test $t4, $t4 116 117 cmovz $t0, $a0 118 cmovz $t1, $a1 119 mov $a0, 8*0($r_ptr) 120 cmovz $t2, $a2 121 mov $a1, 8*1($r_ptr) 122 cmovz $t3, $a3 123 mov $a2, 8*2($r_ptr) 124 mov $a3, 8*3($r_ptr) 125 126 pop %r13 127 pop %r12 128 ret 129.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 130 131################################################################################ 132# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); 133.globl ecp_nistz256_neg 134.type ecp_nistz256_neg,\@function,2 135.align 32 136ecp_nistz256_neg: 137 push %r12 138 push %r13 139 140 xor $a0, $a0 141 xor $a1, $a1 142 xor $a2, $a2 143 xor $a3, $a3 144 xor $t4, $t4 145 146 sub 8*0($a_ptr), $a0 147 sbb 8*1($a_ptr), $a1 148 sbb 8*2($a_ptr), $a2 149 mov $a0, $t0 150 sbb 8*3($a_ptr), $a3 151 lea .Lpoly(%rip), $a_ptr 152 mov $a1, $t1 153 sbb \$0, $t4 154 155 add 8*0($a_ptr), $a0 156 mov $a2, $t2 157 adc 8*1($a_ptr), $a1 158 adc 8*2($a_ptr), $a2 159 mov $a3, $t3 160 adc 8*3($a_ptr), $a3 161 test $t4, $t4 162 163 cmovz $t0, $a0 164 cmovz $t1, $a1 165 mov $a0, 8*0($r_ptr) 166 cmovz $t2, $a2 167 mov $a1, 8*1($r_ptr) 168 cmovz $t3, $a3 169 mov $a2, 8*2($r_ptr) 170 mov $a3, 8*3($r_ptr) 171 172 pop %r13 173 pop %r12 174 ret 175.size ecp_nistz256_neg,.-ecp_nistz256_neg 176___ 177} 178{ 179my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); 180my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); 181my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); 182my ($poly1,$poly3)=($acc6,$acc7); 183 184$code.=<<___; 185################################################################################ 186# void ecp_nistz256_mul_mont( 187# uint64_t res[4], 188# uint64_t a[4], 189# uint64_t b[4]); 190 191.globl ecp_nistz256_mul_mont 192.type ecp_nistz256_mul_mont,\@function,3 193.align 32 194ecp_nistz256_mul_mont: 195___ 196$code.=<<___ if ($addx); 197 mov \$0x80100, %ecx 198 and OPENSSL_ia32cap_P+8(%rip), %ecx 199___ 200$code.=<<___; 201.Lmul_mont: 202 push %rbp 203 push %rbx 204 push %r12 205 push %r13 206 push %r14 207 push %r15 208___ 209$code.=<<___ if ($addx); 210 cmp \$0x80100, %ecx 211 je .Lmul_montx 212___ 213$code.=<<___; 214 mov $b_org, $b_ptr 215 mov 8*0($b_org), %rax 216 mov 8*0($a_ptr), $acc1 217 mov 8*1($a_ptr), $acc2 218 mov 8*2($a_ptr), $acc3 219 mov 8*3($a_ptr), $acc4 220 221 call __ecp_nistz256_mul_montq 222___ 223$code.=<<___ if ($addx); 224 jmp .Lmul_mont_done 225 226.align 32 227.Lmul_montx: 228 mov $b_org, $b_ptr 229 mov 8*0($b_org), %rdx 230 mov 8*0($a_ptr), $acc1 231 mov 8*1($a_ptr), $acc2 232 mov 8*2($a_ptr), $acc3 233 mov 8*3($a_ptr), $acc4 234 lea -128($a_ptr), $a_ptr # control u-op density 235 236 call __ecp_nistz256_mul_montx 237___ 238$code.=<<___; 239.Lmul_mont_done: 240 pop %r15 241 pop %r14 242 pop %r13 243 pop %r12 244 pop %rbx 245 pop %rbp 246 ret 247.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 248 249.type __ecp_nistz256_mul_montq,\@abi-omnipotent 250.align 32 251__ecp_nistz256_mul_montq: 252 ######################################################################## 253 # Multiply a by b[0] 254 mov %rax, $t1 255 mulq $acc1 256 mov .Lpoly+8*1(%rip),$poly1 257 mov %rax, $acc0 258 mov $t1, %rax 259 mov %rdx, $acc1 260 261 mulq $acc2 262 mov .Lpoly+8*3(%rip),$poly3 263 add %rax, $acc1 264 mov $t1, %rax 265 adc \$0, %rdx 266 mov %rdx, $acc2 267 268 mulq $acc3 269 add %rax, $acc2 270 mov $t1, %rax 271 adc \$0, %rdx 272 mov %rdx, $acc3 273 274 mulq $acc4 275 add %rax, $acc3 276 mov $acc0, %rax 277 adc \$0, %rdx 278 xor $acc5, $acc5 279 mov %rdx, $acc4 280 281 ######################################################################## 282 # First reduction step 283 # Basically now we want to multiply acc[0] by p256, 284 # and add the result to the acc. 285 # Due to the special form of p256 we do some optimizations 286 # 287 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] 288 # then we add acc[0] and get acc[0] x 2^96 289 290 mov $acc0, $t1 291 shl \$32, $acc0 292 mulq $poly3 293 shr \$32, $t1 294 add $acc0, $acc1 # +=acc[0]<<96 295 adc $t1, $acc2 296 adc %rax, $acc3 297 mov 8*1($b_ptr), %rax 298 adc %rdx, $acc4 299 adc \$0, $acc5 300 xor $acc0, $acc0 301 302 ######################################################################## 303 # Multiply by b[1] 304 mov %rax, $t1 305 mulq 8*0($a_ptr) 306 add %rax, $acc1 307 mov $t1, %rax 308 adc \$0, %rdx 309 mov %rdx, $t0 310 311 mulq 8*1($a_ptr) 312 add $t0, $acc2 313 adc \$0, %rdx 314 add %rax, $acc2 315 mov $t1, %rax 316 adc \$0, %rdx 317 mov %rdx, $t0 318 319 mulq 8*2($a_ptr) 320 add $t0, $acc3 321 adc \$0, %rdx 322 add %rax, $acc3 323 mov $t1, %rax 324 adc \$0, %rdx 325 mov %rdx, $t0 326 327 mulq 8*3($a_ptr) 328 add $t0, $acc4 329 adc \$0, %rdx 330 add %rax, $acc4 331 mov $acc1, %rax 332 adc %rdx, $acc5 333 adc \$0, $acc0 334 335 ######################################################################## 336 # Second reduction step 337 mov $acc1, $t1 338 shl \$32, $acc1 339 mulq $poly3 340 shr \$32, $t1 341 add $acc1, $acc2 342 adc $t1, $acc3 343 adc %rax, $acc4 344 mov 8*2($b_ptr), %rax 345 adc %rdx, $acc5 346 adc \$0, $acc0 347 xor $acc1, $acc1 348 349 ######################################################################## 350 # Multiply by b[2] 351 mov %rax, $t1 352 mulq 8*0($a_ptr) 353 add %rax, $acc2 354 mov $t1, %rax 355 adc \$0, %rdx 356 mov %rdx, $t0 357 358 mulq 8*1($a_ptr) 359 add $t0, $acc3 360 adc \$0, %rdx 361 add %rax, $acc3 362 mov $t1, %rax 363 adc \$0, %rdx 364 mov %rdx, $t0 365 366 mulq 8*2($a_ptr) 367 add $t0, $acc4 368 adc \$0, %rdx 369 add %rax, $acc4 370 mov $t1, %rax 371 adc \$0, %rdx 372 mov %rdx, $t0 373 374 mulq 8*3($a_ptr) 375 add $t0, $acc5 376 adc \$0, %rdx 377 add %rax, $acc5 378 mov $acc2, %rax 379 adc %rdx, $acc0 380 adc \$0, $acc1 381 382 ######################################################################## 383 # Third reduction step 384 mov $acc2, $t1 385 shl \$32, $acc2 386 mulq $poly3 387 shr \$32, $t1 388 add $acc2, $acc3 389 adc $t1, $acc4 390 adc %rax, $acc5 391 mov 8*3($b_ptr), %rax 392 adc %rdx, $acc0 393 adc \$0, $acc1 394 xor $acc2, $acc2 395 396 ######################################################################## 397 # Multiply by b[3] 398 mov %rax, $t1 399 mulq 8*0($a_ptr) 400 add %rax, $acc3 401 mov $t1, %rax 402 adc \$0, %rdx 403 mov %rdx, $t0 404 405 mulq 8*1($a_ptr) 406 add $t0, $acc4 407 adc \$0, %rdx 408 add %rax, $acc4 409 mov $t1, %rax 410 adc \$0, %rdx 411 mov %rdx, $t0 412 413 mulq 8*2($a_ptr) 414 add $t0, $acc5 415 adc \$0, %rdx 416 add %rax, $acc5 417 mov $t1, %rax 418 adc \$0, %rdx 419 mov %rdx, $t0 420 421 mulq 8*3($a_ptr) 422 add $t0, $acc0 423 adc \$0, %rdx 424 add %rax, $acc0 425 mov $acc3, %rax 426 adc %rdx, $acc1 427 adc \$0, $acc2 428 429 ######################################################################## 430 # Final reduction step 431 mov $acc3, $t1 432 shl \$32, $acc3 433 mulq $poly3 434 shr \$32, $t1 435 add $acc3, $acc4 436 adc $t1, $acc5 437 mov $acc4, $t0 438 adc %rax, $acc0 439 adc %rdx, $acc1 440 mov $acc5, $t1 441 adc \$0, $acc2 442 443 ######################################################################## 444 # Branch-less conditional subtraction of P 445 sub \$-1, $acc4 # .Lpoly[0] 446 mov $acc0, $t2 447 sbb $poly1, $acc5 # .Lpoly[1] 448 sbb \$0, $acc0 # .Lpoly[2] 449 mov $acc1, $t3 450 sbb $poly3, $acc1 # .Lpoly[3] 451 sbb \$0, $acc2 452 453 cmovc $t0, $acc4 454 cmovc $t1, $acc5 455 mov $acc4, 8*0($r_ptr) 456 cmovc $t2, $acc0 457 mov $acc5, 8*1($r_ptr) 458 cmovc $t3, $acc1 459 mov $acc0, 8*2($r_ptr) 460 mov $acc1, 8*3($r_ptr) 461 462 ret 463.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq 464 465################################################################################ 466# void ecp_nistz256_sqr_mont( 467# uint64_t res[4], 468# uint64_t a[4]); 469 470# we optimize the square according to S.Gueron and V.Krasnov, 471# "Speeding up Big-Number Squaring" 472.globl ecp_nistz256_sqr_mont 473.type ecp_nistz256_sqr_mont,\@function,2 474.align 32 475ecp_nistz256_sqr_mont: 476___ 477$code.=<<___ if ($addx); 478 mov \$0x80100, %ecx 479 and OPENSSL_ia32cap_P+8(%rip), %ecx 480___ 481$code.=<<___; 482 push %rbp 483 push %rbx 484 push %r12 485 push %r13 486 push %r14 487 push %r15 488___ 489$code.=<<___ if ($addx); 490 cmp \$0x80100, %ecx 491 je .Lsqr_montx 492___ 493$code.=<<___; 494 mov 8*0($a_ptr), %rax 495 mov 8*1($a_ptr), $acc6 496 mov 8*2($a_ptr), $acc7 497 mov 8*3($a_ptr), $acc0 498 499 call __ecp_nistz256_sqr_montq 500___ 501$code.=<<___ if ($addx); 502 jmp .Lsqr_mont_done 503 504.align 32 505.Lsqr_montx: 506 mov 8*0($a_ptr), %rdx 507 mov 8*1($a_ptr), $acc6 508 mov 8*2($a_ptr), $acc7 509 mov 8*3($a_ptr), $acc0 510 lea -128($a_ptr), $a_ptr # control u-op density 511 512 call __ecp_nistz256_sqr_montx 513___ 514$code.=<<___; 515.Lsqr_mont_done: 516 pop %r15 517 pop %r14 518 pop %r13 519 pop %r12 520 pop %rbx 521 pop %rbp 522 ret 523.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 524 525.type __ecp_nistz256_sqr_montq,\@abi-omnipotent 526.align 32 527__ecp_nistz256_sqr_montq: 528 mov %rax, $acc5 529 mulq $acc6 # a[1]*a[0] 530 mov %rax, $acc1 531 mov $acc7, %rax 532 mov %rdx, $acc2 533 534 mulq $acc5 # a[0]*a[2] 535 add %rax, $acc2 536 mov $acc0, %rax 537 adc \$0, %rdx 538 mov %rdx, $acc3 539 540 mulq $acc5 # a[0]*a[3] 541 add %rax, $acc3 542 mov $acc7, %rax 543 adc \$0, %rdx 544 mov %rdx, $acc4 545 546 ################################# 547 mulq $acc6 # a[1]*a[2] 548 add %rax, $acc3 549 mov $acc0, %rax 550 adc \$0, %rdx 551 mov %rdx, $t1 552 553 mulq $acc6 # a[1]*a[3] 554 add %rax, $acc4 555 mov $acc0, %rax 556 adc \$0, %rdx 557 add $t1, $acc4 558 mov %rdx, $acc5 559 adc \$0, $acc5 560 561 ################################# 562 mulq $acc7 # a[2]*a[3] 563 xor $acc7, $acc7 564 add %rax, $acc5 565 mov 8*0($a_ptr), %rax 566 mov %rdx, $acc6 567 adc \$0, $acc6 568 569 add $acc1, $acc1 # acc1:6<<1 570 adc $acc2, $acc2 571 adc $acc3, $acc3 572 adc $acc4, $acc4 573 adc $acc5, $acc5 574 adc $acc6, $acc6 575 adc \$0, $acc7 576 577 mulq %rax 578 mov %rax, $acc0 579 mov 8*1($a_ptr), %rax 580 mov %rdx, $t0 581 582 mulq %rax 583 add $t0, $acc1 584 adc %rax, $acc2 585 mov 8*2($a_ptr), %rax 586 adc \$0, %rdx 587 mov %rdx, $t0 588 589 mulq %rax 590 add $t0, $acc3 591 adc %rax, $acc4 592 mov 8*3($a_ptr), %rax 593 adc \$0, %rdx 594 mov %rdx, $t0 595 596 mulq %rax 597 add $t0, $acc5 598 adc %rax, $acc6 599 mov $acc0, %rax 600 adc %rdx, $acc7 601 602 mov .Lpoly+8*1(%rip), $a_ptr 603 mov .Lpoly+8*3(%rip), $t1 604 605 ########################################## 606 # Now the reduction 607 # First iteration 608 mov $acc0, $t0 609 shl \$32, $acc0 610 mulq $t1 611 shr \$32, $t0 612 add $acc0, $acc1 # +=acc[0]<<96 613 adc $t0, $acc2 614 adc %rax, $acc3 615 mov $acc1, %rax 616 adc \$0, %rdx 617 618 ########################################## 619 # Second iteration 620 mov $acc1, $t0 621 shl \$32, $acc1 622 mov %rdx, $acc0 623 mulq $t1 624 shr \$32, $t0 625 add $acc1, $acc2 626 adc $t0, $acc3 627 adc %rax, $acc0 628 mov $acc2, %rax 629 adc \$0, %rdx 630 631 ########################################## 632 # Third iteration 633 mov $acc2, $t0 634 shl \$32, $acc2 635 mov %rdx, $acc1 636 mulq $t1 637 shr \$32, $t0 638 add $acc2, $acc3 639 adc $t0, $acc0 640 adc %rax, $acc1 641 mov $acc3, %rax 642 adc \$0, %rdx 643 644 ########################################### 645 # Last iteration 646 mov $acc3, $t0 647 shl \$32, $acc3 648 mov %rdx, $acc2 649 mulq $t1 650 shr \$32, $t0 651 add $acc3, $acc0 652 adc $t0, $acc1 653 adc %rax, $acc2 654 adc \$0, %rdx 655 xor $acc3, $acc3 656 657 ############################################ 658 # Add the rest of the acc 659 add $acc0, $acc4 660 adc $acc1, $acc5 661 mov $acc4, $acc0 662 adc $acc2, $acc6 663 adc %rdx, $acc7 664 mov $acc5, $acc1 665 adc \$0, $acc3 666 667 sub \$-1, $acc4 # .Lpoly[0] 668 mov $acc6, $acc2 669 sbb $a_ptr, $acc5 # .Lpoly[1] 670 sbb \$0, $acc6 # .Lpoly[2] 671 mov $acc7, $t0 672 sbb $t1, $acc7 # .Lpoly[3] 673 sbb \$0, $acc3 674 675 cmovc $acc0, $acc4 676 cmovc $acc1, $acc5 677 mov $acc4, 8*0($r_ptr) 678 cmovc $acc2, $acc6 679 mov $acc5, 8*1($r_ptr) 680 cmovc $t0, $acc7 681 mov $acc6, 8*2($r_ptr) 682 mov $acc7, 8*3($r_ptr) 683 684 ret 685.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq 686___ 687 688if ($addx) { 689$code.=<<___; 690.type __ecp_nistz256_mul_montx,\@abi-omnipotent 691.align 32 692__ecp_nistz256_mul_montx: 693 ######################################################################## 694 # Multiply by b[0] 695 mulx $acc1, $acc0, $acc1 696 mulx $acc2, $t0, $acc2 697 mov \$32, $poly1 698 xor $acc5, $acc5 # cf=0 699 mulx $acc3, $t1, $acc3 700 mov .Lpoly+8*3(%rip), $poly3 701 adc $t0, $acc1 702 mulx $acc4, $t0, $acc4 703 mov $acc0, %rdx 704 adc $t1, $acc2 705 shlx $poly1,$acc0,$t1 706 adc $t0, $acc3 707 shrx $poly1,$acc0,$t0 708 adc \$0, $acc4 709 710 ######################################################################## 711 # First reduction step 712 add $t1, $acc1 713 adc $t0, $acc2 714 715 mulx $poly3, $t0, $t1 716 mov 8*1($b_ptr), %rdx 717 adc $t0, $acc3 718 adc $t1, $acc4 719 adc \$0, $acc5 720 xor $acc0, $acc0 # $acc0=0,cf=0,of=0 721 722 ######################################################################## 723 # Multiply by b[1] 724 mulx 8*0+128($a_ptr), $t0, $t1 725 adcx $t0, $acc1 726 adox $t1, $acc2 727 728 mulx 8*1+128($a_ptr), $t0, $t1 729 adcx $t0, $acc2 730 adox $t1, $acc3 731 732 mulx 8*2+128($a_ptr), $t0, $t1 733 adcx $t0, $acc3 734 adox $t1, $acc4 735 736 mulx 8*3+128($a_ptr), $t0, $t1 737 mov $acc1, %rdx 738 adcx $t0, $acc4 739 shlx $poly1, $acc1, $t0 740 adox $t1, $acc5 741 shrx $poly1, $acc1, $t1 742 743 adcx $acc0, $acc5 744 adox $acc0, $acc0 745 adc \$0, $acc0 746 747 ######################################################################## 748 # Second reduction step 749 add $t0, $acc2 750 adc $t1, $acc3 751 752 mulx $poly3, $t0, $t1 753 mov 8*2($b_ptr), %rdx 754 adc $t0, $acc4 755 adc $t1, $acc5 756 adc \$0, $acc0 757 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 758 759 ######################################################################## 760 # Multiply by b[2] 761 mulx 8*0+128($a_ptr), $t0, $t1 762 adcx $t0, $acc2 763 adox $t1, $acc3 764 765 mulx 8*1+128($a_ptr), $t0, $t1 766 adcx $t0, $acc3 767 adox $t1, $acc4 768 769 mulx 8*2+128($a_ptr), $t0, $t1 770 adcx $t0, $acc4 771 adox $t1, $acc5 772 773 mulx 8*3+128($a_ptr), $t0, $t1 774 mov $acc2, %rdx 775 adcx $t0, $acc5 776 shlx $poly1, $acc2, $t0 777 adox $t1, $acc0 778 shrx $poly1, $acc2, $t1 779 780 adcx $acc1, $acc0 781 adox $acc1, $acc1 782 adc \$0, $acc1 783 784 ######################################################################## 785 # Third reduction step 786 add $t0, $acc3 787 adc $t1, $acc4 788 789 mulx $poly3, $t0, $t1 790 mov 8*3($b_ptr), %rdx 791 adc $t0, $acc5 792 adc $t1, $acc0 793 adc \$0, $acc1 794 xor $acc2, $acc2 # $acc2=0,cf=0,of=0 795 796 ######################################################################## 797 # Multiply by b[3] 798 mulx 8*0+128($a_ptr), $t0, $t1 799 adcx $t0, $acc3 800 adox $t1, $acc4 801 802 mulx 8*1+128($a_ptr), $t0, $t1 803 adcx $t0, $acc4 804 adox $t1, $acc5 805 806 mulx 8*2+128($a_ptr), $t0, $t1 807 adcx $t0, $acc5 808 adox $t1, $acc0 809 810 mulx 8*3+128($a_ptr), $t0, $t1 811 mov $acc3, %rdx 812 adcx $t0, $acc0 813 shlx $poly1, $acc3, $t0 814 adox $t1, $acc1 815 shrx $poly1, $acc3, $t1 816 817 adcx $acc2, $acc1 818 adox $acc2, $acc2 819 adc \$0, $acc2 820 821 ######################################################################## 822 # Fourth reduction step 823 add $t0, $acc4 824 adc $t1, $acc5 825 826 mulx $poly3, $t0, $t1 827 mov $acc4, $t2 828 mov .Lpoly+8*1(%rip), $poly1 829 adc $t0, $acc0 830 mov $acc5, $t3 831 adc $t1, $acc1 832 adc \$0, $acc2 833 834 ######################################################################## 835 # Branch-less conditional subtraction of P 836 xor %eax, %eax 837 mov $acc0, $t0 838 sbb \$-1, $acc4 # .Lpoly[0] 839 sbb $poly1, $acc5 # .Lpoly[1] 840 sbb \$0, $acc0 # .Lpoly[2] 841 mov $acc1, $t1 842 sbb $poly3, $acc1 # .Lpoly[3] 843 sbb \$0, $acc2 844 845 cmovc $t2, $acc4 846 cmovc $t3, $acc5 847 mov $acc4, 8*0($r_ptr) 848 cmovc $t0, $acc0 849 mov $acc5, 8*1($r_ptr) 850 cmovc $t1, $acc1 851 mov $acc0, 8*2($r_ptr) 852 mov $acc1, 8*3($r_ptr) 853 854 ret 855.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx 856 857.type __ecp_nistz256_sqr_montx,\@abi-omnipotent 858.align 32 859__ecp_nistz256_sqr_montx: 860 mulx $acc6, $acc1, $acc2 # a[0]*a[1] 861 mulx $acc7, $t0, $acc3 # a[0]*a[2] 862 xor %eax, %eax 863 adc $t0, $acc2 864 mulx $acc0, $t1, $acc4 # a[0]*a[3] 865 mov $acc6, %rdx 866 adc $t1, $acc3 867 adc \$0, $acc4 868 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 869 870 ################################# 871 mulx $acc7, $t0, $t1 # a[1]*a[2] 872 adcx $t0, $acc3 873 adox $t1, $acc4 874 875 mulx $acc0, $t0, $t1 # a[1]*a[3] 876 mov $acc7, %rdx 877 adcx $t0, $acc4 878 adox $t1, $acc5 879 adc \$0, $acc5 880 881 ################################# 882 mulx $acc0, $t0, $acc6 # a[2]*a[3] 883 mov 8*0+128($a_ptr), %rdx 884 xor $acc7, $acc7 # $acc7=0,cf=0,of=0 885 adcx $acc1, $acc1 # acc1:6<<1 886 adox $t0, $acc5 887 adcx $acc2, $acc2 888 adox $acc7, $acc6 # of=0 889 890 mulx %rdx, $acc0, $t1 891 mov 8*1+128($a_ptr), %rdx 892 adcx $acc3, $acc3 893 adox $t1, $acc1 894 adcx $acc4, $acc4 895 mulx %rdx, $t0, $t4 896 mov 8*2+128($a_ptr), %rdx 897 adcx $acc5, $acc5 898 adox $t0, $acc2 899 adcx $acc6, $acc6 900 .byte 0x67 901 mulx %rdx, $t0, $t1 902 mov 8*3+128($a_ptr), %rdx 903 adox $t4, $acc3 904 adcx $acc7, $acc7 905 adox $t0, $acc4 906 mov \$32, $a_ptr 907 adox $t1, $acc5 908 .byte 0x67,0x67 909 mulx %rdx, $t0, $t4 910 mov $acc0, %rdx 911 adox $t0, $acc6 912 shlx $a_ptr, $acc0, $t0 913 adox $t4, $acc7 914 shrx $a_ptr, $acc0, $t4 915 mov .Lpoly+8*3(%rip), $t1 916 917 # reduction step 1 918 add $t0, $acc1 919 adc $t4, $acc2 920 921 mulx $t1, $t0, $acc0 922 mov $acc1, %rdx 923 adc $t0, $acc3 924 shlx $a_ptr, $acc1, $t0 925 adc \$0, $acc0 926 shrx $a_ptr, $acc1, $t4 927 928 # reduction step 2 929 add $t0, $acc2 930 adc $t4, $acc3 931 932 mulx $t1, $t0, $acc1 933 mov $acc2, %rdx 934 adc $t0, $acc0 935 shlx $a_ptr, $acc2, $t0 936 adc \$0, $acc1 937 shrx $a_ptr, $acc2, $t4 938 939 # reduction step 3 940 add $t0, $acc3 941 adc $t4, $acc0 942 943 mulx $t1, $t0, $acc2 944 mov $acc3, %rdx 945 adc $t0, $acc1 946 shlx $a_ptr, $acc3, $t0 947 adc \$0, $acc2 948 shrx $a_ptr, $acc3, $t4 949 950 # reduction step 4 951 add $t0, $acc0 952 adc $t4, $acc1 953 954 mulx $t1, $t0, $acc3 955 adc $t0, $acc2 956 adc \$0, $acc3 957 958 xor $t3, $t3 # cf=0 959 adc $acc0, $acc4 # accumulate upper half 960 mov .Lpoly+8*1(%rip), $a_ptr 961 adc $acc1, $acc5 962 mov $acc4, $acc0 963 adc $acc2, $acc6 964 adc $acc3, $acc7 965 mov $acc5, $acc1 966 adc \$0, $t3 967 968 xor %eax, %eax # cf=0 969 sbb \$-1, $acc4 # .Lpoly[0] 970 mov $acc6, $acc2 971 sbb $a_ptr, $acc5 # .Lpoly[1] 972 sbb \$0, $acc6 # .Lpoly[2] 973 mov $acc7, $acc3 974 sbb $t1, $acc7 # .Lpoly[3] 975 sbb \$0, $t3 976 977 cmovc $acc0, $acc4 978 cmovc $acc1, $acc5 979 mov $acc4, 8*0($r_ptr) 980 cmovc $acc2, $acc6 981 mov $acc5, 8*1($r_ptr) 982 cmovc $acc3, $acc7 983 mov $acc6, 8*2($r_ptr) 984 mov $acc7, 8*3($r_ptr) 985 986 ret 987.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx 988___ 989} 990} 991{ 992my ($r_ptr,$in_ptr)=("%rdi","%rsi"); 993my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11)); 994my ($t0,$t1,$t2)=("%rcx","%r12","%r13"); 995 996$code.=<<___; 997################################################################################ 998# void ecp_nistz256_from_mont( 999# uint64_t res[4], 1000# uint64_t in[4]); 1001# This one performs Montgomery multiplication by 1, so we only need the reduction 1002 1003.globl ecp_nistz256_from_mont 1004.type ecp_nistz256_from_mont,\@function,2 1005.align 32 1006ecp_nistz256_from_mont: 1007 push %r12 1008 push %r13 1009 1010 mov 8*0($in_ptr), %rax 1011 mov .Lpoly+8*3(%rip), $t2 1012 mov 8*1($in_ptr), $acc1 1013 mov 8*2($in_ptr), $acc2 1014 mov 8*3($in_ptr), $acc3 1015 mov %rax, $acc0 1016 mov .Lpoly+8*1(%rip), $t1 1017 1018 ######################################### 1019 # First iteration 1020 mov %rax, $t0 1021 shl \$32, $acc0 1022 mulq $t2 1023 shr \$32, $t0 1024 add $acc0, $acc1 1025 adc $t0, $acc2 1026 adc %rax, $acc3 1027 mov $acc1, %rax 1028 adc \$0, %rdx 1029 1030 ######################################### 1031 # Second iteration 1032 mov $acc1, $t0 1033 shl \$32, $acc1 1034 mov %rdx, $acc0 1035 mulq $t2 1036 shr \$32, $t0 1037 add $acc1, $acc2 1038 adc $t0, $acc3 1039 adc %rax, $acc0 1040 mov $acc2, %rax 1041 adc \$0, %rdx 1042 1043 ########################################## 1044 # Third iteration 1045 mov $acc2, $t0 1046 shl \$32, $acc2 1047 mov %rdx, $acc1 1048 mulq $t2 1049 shr \$32, $t0 1050 add $acc2, $acc3 1051 adc $t0, $acc0 1052 adc %rax, $acc1 1053 mov $acc3, %rax 1054 adc \$0, %rdx 1055 1056 ########################################### 1057 # Last iteration 1058 mov $acc3, $t0 1059 shl \$32, $acc3 1060 mov %rdx, $acc2 1061 mulq $t2 1062 shr \$32, $t0 1063 add $acc3, $acc0 1064 adc $t0, $acc1 1065 mov $acc0, $t0 1066 adc %rax, $acc2 1067 mov $acc1, $in_ptr 1068 adc \$0, %rdx 1069 1070 sub \$-1, $acc0 1071 mov $acc2, %rax 1072 sbb $t1, $acc1 1073 sbb \$0, $acc2 1074 mov %rdx, $acc3 1075 sbb $t2, %rdx 1076 sbb $t2, $t2 1077 1078 cmovnz $t0, $acc0 1079 cmovnz $in_ptr, $acc1 1080 mov $acc0, 8*0($r_ptr) 1081 cmovnz %rax, $acc2 1082 mov $acc1, 8*1($r_ptr) 1083 cmovz %rdx, $acc3 1084 mov $acc2, 8*2($r_ptr) 1085 mov $acc3, 8*3($r_ptr) 1086 1087 pop %r13 1088 pop %r12 1089 ret 1090.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont 1091___ 1092} 1093{ 1094my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 1095my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); 1096my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); 1097my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); 1098 1099$code.=<<___; 1100################################################################################ 1101# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); 1102.globl ecp_nistz256_select_w5 1103.type ecp_nistz256_select_w5,\@abi-omnipotent 1104.align 32 1105ecp_nistz256_select_w5: 1106___ 1107$code.=<<___ if ($avx>1); 1108 mov OPENSSL_ia32cap_P+8(%rip), %eax 1109 test \$`1<<5`, %eax 1110 jnz .Lavx2_select_w5 1111___ 1112$code.=<<___ if ($win64); 1113 lea -0x88(%rsp), %rax 1114.LSEH_begin_ecp_nistz256_select_w5: 1115 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 1116 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 1117 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 1118 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) 1119 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 1120 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 1121 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 1122 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 1123 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 1124 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 1125 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 1126___ 1127$code.=<<___; 1128 movdqa .LOne(%rip), $ONE 1129 movd $index, $INDEX 1130 1131 pxor $Ra, $Ra 1132 pxor $Rb, $Rb 1133 pxor $Rc, $Rc 1134 pxor $Rd, $Rd 1135 pxor $Re, $Re 1136 pxor $Rf, $Rf 1137 1138 movdqa $ONE, $M0 1139 pshufd \$0, $INDEX, $INDEX 1140 1141 mov \$16, %rax 1142.Lselect_loop_sse_w5: 1143 1144 movdqa $M0, $TMP0 1145 paddd $ONE, $M0 1146 pcmpeqd $INDEX, $TMP0 1147 1148 movdqa 16*0($in_t), $T0a 1149 movdqa 16*1($in_t), $T0b 1150 movdqa 16*2($in_t), $T0c 1151 movdqa 16*3($in_t), $T0d 1152 movdqa 16*4($in_t), $T0e 1153 movdqa 16*5($in_t), $T0f 1154 lea 16*6($in_t), $in_t 1155 1156 pand $TMP0, $T0a 1157 pand $TMP0, $T0b 1158 por $T0a, $Ra 1159 pand $TMP0, $T0c 1160 por $T0b, $Rb 1161 pand $TMP0, $T0d 1162 por $T0c, $Rc 1163 pand $TMP0, $T0e 1164 por $T0d, $Rd 1165 pand $TMP0, $T0f 1166 por $T0e, $Re 1167 por $T0f, $Rf 1168 1169 dec %rax 1170 jnz .Lselect_loop_sse_w5 1171 1172 movdqu $Ra, 16*0($val) 1173 movdqu $Rb, 16*1($val) 1174 movdqu $Rc, 16*2($val) 1175 movdqu $Rd, 16*3($val) 1176 movdqu $Re, 16*4($val) 1177 movdqu $Rf, 16*5($val) 1178___ 1179$code.=<<___ if ($win64); 1180 movaps (%rsp), %xmm6 1181 movaps 0x10(%rsp), %xmm7 1182 movaps 0x20(%rsp), %xmm8 1183 movaps 0x30(%rsp), %xmm9 1184 movaps 0x40(%rsp), %xmm10 1185 movaps 0x50(%rsp), %xmm11 1186 movaps 0x60(%rsp), %xmm12 1187 movaps 0x70(%rsp), %xmm13 1188 movaps 0x80(%rsp), %xmm14 1189 movaps 0x90(%rsp), %xmm15 1190 lea 0xa8(%rsp), %rsp 1191.LSEH_end_ecp_nistz256_select_w5: 1192___ 1193$code.=<<___; 1194 ret 1195.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 1196 1197################################################################################ 1198# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); 1199.globl ecp_nistz256_select_w7 1200.type ecp_nistz256_select_w7,\@abi-omnipotent 1201.align 32 1202ecp_nistz256_select_w7: 1203___ 1204$code.=<<___ if ($avx>1); 1205 mov OPENSSL_ia32cap_P+8(%rip), %eax 1206 test \$`1<<5`, %eax 1207 jnz .Lavx2_select_w7 1208___ 1209$code.=<<___ if ($win64); 1210 lea -0x88(%rsp), %rax 1211.LSEH_begin_ecp_nistz256_select_w7: 1212 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 1213 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 1214 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 1215 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) 1216 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 1217 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 1218 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 1219 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 1220 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 1221 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 1222 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 1223___ 1224$code.=<<___; 1225 movdqa .LOne(%rip), $M0 1226 movd $index, $INDEX 1227 1228 pxor $Ra, $Ra 1229 pxor $Rb, $Rb 1230 pxor $Rc, $Rc 1231 pxor $Rd, $Rd 1232 1233 movdqa $M0, $ONE 1234 pshufd \$0, $INDEX, $INDEX 1235 mov \$64, %rax 1236 1237.Lselect_loop_sse_w7: 1238 movdqa $M0, $TMP0 1239 paddd $ONE, $M0 1240 movdqa 16*0($in_t), $T0a 1241 movdqa 16*1($in_t), $T0b 1242 pcmpeqd $INDEX, $TMP0 1243 movdqa 16*2($in_t), $T0c 1244 movdqa 16*3($in_t), $T0d 1245 lea 16*4($in_t), $in_t 1246 1247 pand $TMP0, $T0a 1248 pand $TMP0, $T0b 1249 por $T0a, $Ra 1250 pand $TMP0, $T0c 1251 por $T0b, $Rb 1252 pand $TMP0, $T0d 1253 por $T0c, $Rc 1254 prefetcht0 255($in_t) 1255 por $T0d, $Rd 1256 1257 dec %rax 1258 jnz .Lselect_loop_sse_w7 1259 1260 movdqu $Ra, 16*0($val) 1261 movdqu $Rb, 16*1($val) 1262 movdqu $Rc, 16*2($val) 1263 movdqu $Rd, 16*3($val) 1264___ 1265$code.=<<___ if ($win64); 1266 movaps (%rsp), %xmm6 1267 movaps 0x10(%rsp), %xmm7 1268 movaps 0x20(%rsp), %xmm8 1269 movaps 0x30(%rsp), %xmm9 1270 movaps 0x40(%rsp), %xmm10 1271 movaps 0x50(%rsp), %xmm11 1272 movaps 0x60(%rsp), %xmm12 1273 movaps 0x70(%rsp), %xmm13 1274 movaps 0x80(%rsp), %xmm14 1275 movaps 0x90(%rsp), %xmm15 1276 lea 0xa8(%rsp), %rsp 1277.LSEH_end_ecp_nistz256_select_w7: 1278___ 1279$code.=<<___; 1280 ret 1281.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 1282___ 1283} 1284if ($avx>1) { 1285my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 1286my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); 1287my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); 1288my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); 1289 1290$code.=<<___; 1291################################################################################ 1292# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index); 1293.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent 1294.align 32 1295ecp_nistz256_avx2_select_w5: 1296.Lavx2_select_w5: 1297 vzeroupper 1298___ 1299$code.=<<___ if ($win64); 1300 lea -0x88(%rsp), %rax 1301.LSEH_begin_ecp_nistz256_avx2_select_w5: 1302 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 1303 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax) 1304 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax) 1305 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax) 1306 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax) 1307 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax) 1308 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax) 1309 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax) 1310 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax) 1311 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax) 1312 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax) 1313___ 1314$code.=<<___; 1315 vmovdqa .LTwo(%rip), $TWO 1316 1317 vpxor $Ra, $Ra, $Ra 1318 vpxor $Rb, $Rb, $Rb 1319 vpxor $Rc, $Rc, $Rc 1320 1321 vmovdqa .LOne(%rip), $M0 1322 vmovdqa .LTwo(%rip), $M1 1323 1324 vmovd $index, %xmm1 1325 vpermd $INDEX, $Ra, $INDEX 1326 1327 mov \$8, %rax 1328.Lselect_loop_avx2_w5: 1329 1330 vmovdqa 32*0($in_t), $T0a 1331 vmovdqa 32*1($in_t), $T0b 1332 vmovdqa 32*2($in_t), $T0c 1333 1334 vmovdqa 32*3($in_t), $T1a 1335 vmovdqa 32*4($in_t), $T1b 1336 vmovdqa 32*5($in_t), $T1c 1337 1338 vpcmpeqd $INDEX, $M0, $TMP0 1339 vpcmpeqd $INDEX, $M1, $TMP1 1340 1341 vpaddd $TWO, $M0, $M0 1342 vpaddd $TWO, $M1, $M1 1343 lea 32*6($in_t), $in_t 1344 1345 vpand $TMP0, $T0a, $T0a 1346 vpand $TMP0, $T0b, $T0b 1347 vpand $TMP0, $T0c, $T0c 1348 vpand $TMP1, $T1a, $T1a 1349 vpand $TMP1, $T1b, $T1b 1350 vpand $TMP1, $T1c, $T1c 1351 1352 vpxor $T0a, $Ra, $Ra 1353 vpxor $T0b, $Rb, $Rb 1354 vpxor $T0c, $Rc, $Rc 1355 vpxor $T1a, $Ra, $Ra 1356 vpxor $T1b, $Rb, $Rb 1357 vpxor $T1c, $Rc, $Rc 1358 1359 dec %rax 1360 jnz .Lselect_loop_avx2_w5 1361 1362 vmovdqu $Ra, 32*0($val) 1363 vmovdqu $Rb, 32*1($val) 1364 vmovdqu $Rc, 32*2($val) 1365 vzeroupper 1366___ 1367$code.=<<___ if ($win64); 1368 movaps (%rsp), %xmm6 1369 movaps 0x10(%rsp), %xmm7 1370 movaps 0x20(%rsp), %xmm8 1371 movaps 0x30(%rsp), %xmm9 1372 movaps 0x40(%rsp), %xmm10 1373 movaps 0x50(%rsp), %xmm11 1374 movaps 0x60(%rsp), %xmm12 1375 movaps 0x70(%rsp), %xmm13 1376 movaps 0x80(%rsp), %xmm14 1377 movaps 0x90(%rsp), %xmm15 1378 lea 0xa8(%rsp), %rsp 1379.LSEH_end_ecp_nistz256_avx2_select_w5: 1380___ 1381$code.=<<___; 1382 ret 1383.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 1384___ 1385} 1386if ($avx>1) { 1387my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 1388my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); 1389my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); 1390my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); 1391my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); 1392 1393$code.=<<___; 1394 1395################################################################################ 1396# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index); 1397.globl ecp_nistz256_avx2_select_w7 1398.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent 1399.align 32 1400ecp_nistz256_avx2_select_w7: 1401.Lavx2_select_w7: 1402 vzeroupper 1403___ 1404$code.=<<___ if ($win64); 1405 lea -0x88(%rsp), %rax 1406.LSEH_begin_ecp_nistz256_avx2_select_w7: 1407 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 1408 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax) 1409 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax) 1410 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax) 1411 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax) 1412 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax) 1413 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax) 1414 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax) 1415 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax) 1416 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax) 1417 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax) 1418___ 1419$code.=<<___; 1420 vmovdqa .LThree(%rip), $THREE 1421 1422 vpxor $Ra, $Ra, $Ra 1423 vpxor $Rb, $Rb, $Rb 1424 1425 vmovdqa .LOne(%rip), $M0 1426 vmovdqa .LTwo(%rip), $M1 1427 vmovdqa .LThree(%rip), $M2 1428 1429 vmovd $index, %xmm1 1430 vpermd $INDEX, $Ra, $INDEX 1431 # Skip index = 0, because it is implicitly the point at infinity 1432 1433 mov \$21, %rax 1434.Lselect_loop_avx2_w7: 1435 1436 vmovdqa 32*0($in_t), $T0a 1437 vmovdqa 32*1($in_t), $T0b 1438 1439 vmovdqa 32*2($in_t), $T1a 1440 vmovdqa 32*3($in_t), $T1b 1441 1442 vmovdqa 32*4($in_t), $T2a 1443 vmovdqa 32*5($in_t), $T2b 1444 1445 vpcmpeqd $INDEX, $M0, $TMP0 1446 vpcmpeqd $INDEX, $M1, $TMP1 1447 vpcmpeqd $INDEX, $M2, $TMP2 1448 1449 vpaddd $THREE, $M0, $M0 1450 vpaddd $THREE, $M1, $M1 1451 vpaddd $THREE, $M2, $M2 1452 lea 32*6($in_t), $in_t 1453 1454 vpand $TMP0, $T0a, $T0a 1455 vpand $TMP0, $T0b, $T0b 1456 vpand $TMP1, $T1a, $T1a 1457 vpand $TMP1, $T1b, $T1b 1458 vpand $TMP2, $T2a, $T2a 1459 vpand $TMP2, $T2b, $T2b 1460 1461 vpxor $T0a, $Ra, $Ra 1462 vpxor $T0b, $Rb, $Rb 1463 vpxor $T1a, $Ra, $Ra 1464 vpxor $T1b, $Rb, $Rb 1465 vpxor $T2a, $Ra, $Ra 1466 vpxor $T2b, $Rb, $Rb 1467 1468 dec %rax 1469 jnz .Lselect_loop_avx2_w7 1470 1471 1472 vmovdqa 32*0($in_t), $T0a 1473 vmovdqa 32*1($in_t), $T0b 1474 1475 vpcmpeqd $INDEX, $M0, $TMP0 1476 1477 vpand $TMP0, $T0a, $T0a 1478 vpand $TMP0, $T0b, $T0b 1479 1480 vpxor $T0a, $Ra, $Ra 1481 vpxor $T0b, $Rb, $Rb 1482 1483 vmovdqu $Ra, 32*0($val) 1484 vmovdqu $Rb, 32*1($val) 1485 vzeroupper 1486___ 1487$code.=<<___ if ($win64); 1488 movaps (%rsp), %xmm6 1489 movaps 0x10(%rsp), %xmm7 1490 movaps 0x20(%rsp), %xmm8 1491 movaps 0x30(%rsp), %xmm9 1492 movaps 0x40(%rsp), %xmm10 1493 movaps 0x50(%rsp), %xmm11 1494 movaps 0x60(%rsp), %xmm12 1495 movaps 0x70(%rsp), %xmm13 1496 movaps 0x80(%rsp), %xmm14 1497 movaps 0x90(%rsp), %xmm15 1498 lea 0xa8(%rsp), %rsp 1499.LSEH_end_ecp_nistz256_avx2_select_w7: 1500___ 1501$code.=<<___; 1502 ret 1503.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 1504___ 1505} else { 1506$code.=<<___; 1507.globl ecp_nistz256_avx2_select_w7 1508.type ecp_nistz256_avx2_select_w7,\@function,3 1509.align 32 1510ecp_nistz256_avx2_select_w7: 1511 .byte 0x0f,0x0b # ud2 1512 ret 1513.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 1514___ 1515} 1516{{{ 1517######################################################################## 1518# This block implements higher level point_double, point_add and 1519# point_add_affine. The key to performance in this case is to allow 1520# out-of-order execution logic to overlap computations from next step 1521# with tail processing from current step. By using tailored calling 1522# sequence we minimize inter-step overhead to give processor better 1523# shot at overlapping operations... 1524# 1525# You will notice that input data is copied to stack. Trouble is that 1526# there are no registers to spare for holding original pointers and 1527# reloading them, pointers, would create undesired dependencies on 1528# effective addresses calculation paths. In other words it's too done 1529# to favour out-of-order execution logic. 1530# <appro@openssl.org> 1531 1532my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); 1533my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); 1534my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); 1535my ($poly1,$poly3)=($acc6,$acc7); 1536 1537sub load_for_mul () { 1538my ($a,$b,$src0) = @_; 1539my $bias = $src0 eq "%rax" ? 0 : -128; 1540 1541" mov $b, $src0 1542 lea $b, $b_ptr 1543 mov 8*0+$a, $acc1 1544 mov 8*1+$a, $acc2 1545 lea $bias+$a, $a_ptr 1546 mov 8*2+$a, $acc3 1547 mov 8*3+$a, $acc4" 1548} 1549 1550sub load_for_sqr () { 1551my ($a,$src0) = @_; 1552my $bias = $src0 eq "%rax" ? 0 : -128; 1553 1554" mov 8*0+$a, $src0 1555 mov 8*1+$a, $acc6 1556 lea $bias+$a, $a_ptr 1557 mov 8*2+$a, $acc7 1558 mov 8*3+$a, $acc0" 1559} 1560 1561 { 1562######################################################################## 1563# operate in 4-5-0-1 "name space" that matches multiplication output 1564# 1565my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 1566 1567$code.=<<___; 1568.type __ecp_nistz256_add_toq,\@abi-omnipotent 1569.align 32 1570__ecp_nistz256_add_toq: 1571 add 8*0($b_ptr), $a0 1572 adc 8*1($b_ptr), $a1 1573 mov $a0, $t0 1574 adc 8*2($b_ptr), $a2 1575 adc 8*3($b_ptr), $a3 1576 mov $a1, $t1 1577 sbb $t4, $t4 1578 1579 sub \$-1, $a0 1580 mov $a2, $t2 1581 sbb $poly1, $a1 1582 sbb \$0, $a2 1583 mov $a3, $t3 1584 sbb $poly3, $a3 1585 test $t4, $t4 1586 1587 cmovz $t0, $a0 1588 cmovz $t1, $a1 1589 mov $a0, 8*0($r_ptr) 1590 cmovz $t2, $a2 1591 mov $a1, 8*1($r_ptr) 1592 cmovz $t3, $a3 1593 mov $a2, 8*2($r_ptr) 1594 mov $a3, 8*3($r_ptr) 1595 1596 ret 1597.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq 1598 1599.type __ecp_nistz256_sub_fromq,\@abi-omnipotent 1600.align 32 1601__ecp_nistz256_sub_fromq: 1602 sub 8*0($b_ptr), $a0 1603 sbb 8*1($b_ptr), $a1 1604 mov $a0, $t0 1605 sbb 8*2($b_ptr), $a2 1606 sbb 8*3($b_ptr), $a3 1607 mov $a1, $t1 1608 sbb $t4, $t4 1609 1610 add \$-1, $a0 1611 mov $a2, $t2 1612 adc $poly1, $a1 1613 adc \$0, $a2 1614 mov $a3, $t3 1615 adc $poly3, $a3 1616 test $t4, $t4 1617 1618 cmovz $t0, $a0 1619 cmovz $t1, $a1 1620 mov $a0, 8*0($r_ptr) 1621 cmovz $t2, $a2 1622 mov $a1, 8*1($r_ptr) 1623 cmovz $t3, $a3 1624 mov $a2, 8*2($r_ptr) 1625 mov $a3, 8*3($r_ptr) 1626 1627 ret 1628.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq 1629 1630.type __ecp_nistz256_subq,\@abi-omnipotent 1631.align 32 1632__ecp_nistz256_subq: 1633 sub $a0, $t0 1634 sbb $a1, $t1 1635 mov $t0, $a0 1636 sbb $a2, $t2 1637 sbb $a3, $t3 1638 mov $t1, $a1 1639 sbb $t4, $t4 1640 1641 add \$-1, $t0 1642 mov $t2, $a2 1643 adc $poly1, $t1 1644 adc \$0, $t2 1645 mov $t3, $a3 1646 adc $poly3, $t3 1647 test $t4, $t4 1648 1649 cmovnz $t0, $a0 1650 cmovnz $t1, $a1 1651 cmovnz $t2, $a2 1652 cmovnz $t3, $a3 1653 1654 ret 1655.size __ecp_nistz256_subq,.-__ecp_nistz256_subq 1656 1657.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent 1658.align 32 1659__ecp_nistz256_mul_by_2q: 1660 add $a0, $a0 # a0:a3+a0:a3 1661 adc $a1, $a1 1662 mov $a0, $t0 1663 adc $a2, $a2 1664 adc $a3, $a3 1665 mov $a1, $t1 1666 sbb $t4, $t4 1667 1668 sub \$-1, $a0 1669 mov $a2, $t2 1670 sbb $poly1, $a1 1671 sbb \$0, $a2 1672 mov $a3, $t3 1673 sbb $poly3, $a3 1674 test $t4, $t4 1675 1676 cmovz $t0, $a0 1677 cmovz $t1, $a1 1678 mov $a0, 8*0($r_ptr) 1679 cmovz $t2, $a2 1680 mov $a1, 8*1($r_ptr) 1681 cmovz $t3, $a3 1682 mov $a2, 8*2($r_ptr) 1683 mov $a3, 8*3($r_ptr) 1684 1685 ret 1686.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q 1687___ 1688 } 1689sub gen_double () { 1690 my $x = shift; 1691 my ($src0,$sfx,$bias); 1692 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); 1693 1694 if ($x ne "x") { 1695 $src0 = "%rax"; 1696 $sfx = ""; 1697 $bias = 0; 1698 1699$code.=<<___; 1700.globl ecp_nistz256_point_double 1701.type ecp_nistz256_point_double,\@function,2 1702.align 32 1703ecp_nistz256_point_double: 1704___ 1705$code.=<<___ if ($addx); 1706 mov \$0x80100, %ecx 1707 and OPENSSL_ia32cap_P+8(%rip), %ecx 1708 cmp \$0x80100, %ecx 1709 je .Lpoint_doublex 1710___ 1711 } else { 1712 $src0 = "%rdx"; 1713 $sfx = "x"; 1714 $bias = 128; 1715 1716$code.=<<___; 1717.type ecp_nistz256_point_doublex,\@function,2 1718.align 32 1719ecp_nistz256_point_doublex: 1720.Lpoint_doublex: 1721___ 1722 } 1723$code.=<<___; 1724 push %rbp 1725 push %rbx 1726 push %r12 1727 push %r13 1728 push %r14 1729 push %r15 1730 sub \$32*5+8, %rsp 1731 1732 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x 1733 mov $a_ptr, $b_ptr # backup copy 1734 movdqu 0x10($a_ptr), %xmm1 1735 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order 1736 mov 0x20+8*1($a_ptr), $acc5 1737 mov 0x20+8*2($a_ptr), $acc0 1738 mov 0x20+8*3($a_ptr), $acc1 1739 mov .Lpoly+8*1(%rip), $poly1 1740 mov .Lpoly+8*3(%rip), $poly3 1741 movdqa %xmm0, $in_x(%rsp) 1742 movdqa %xmm1, $in_x+0x10(%rsp) 1743 lea 0x20($r_ptr), $acc2 1744 lea 0x40($r_ptr), $acc3 1745 movq $r_ptr, %xmm0 1746 movq $acc2, %xmm1 1747 movq $acc3, %xmm2 1748 1749 lea $S(%rsp), $r_ptr 1750 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); 1751 1752 mov 0x40+8*0($a_ptr), $src0 1753 mov 0x40+8*1($a_ptr), $acc6 1754 mov 0x40+8*2($a_ptr), $acc7 1755 mov 0x40+8*3($a_ptr), $acc0 1756 lea 0x40-$bias($a_ptr), $a_ptr 1757 lea $Zsqr(%rsp), $r_ptr 1758 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); 1759 1760 `&load_for_sqr("$S(%rsp)", "$src0")` 1761 lea $S(%rsp), $r_ptr 1762 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); 1763 1764 mov 0x20($b_ptr), $src0 # $b_ptr is still valid 1765 mov 0x40+8*0($b_ptr), $acc1 1766 mov 0x40+8*1($b_ptr), $acc2 1767 mov 0x40+8*2($b_ptr), $acc3 1768 mov 0x40+8*3($b_ptr), $acc4 1769 lea 0x40-$bias($b_ptr), $a_ptr 1770 lea 0x20($b_ptr), $b_ptr 1771 movq %xmm2, $r_ptr 1772 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); 1773 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); 1774 1775 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 1776 mov $in_x+8*1(%rsp), $acc5 1777 lea $Zsqr(%rsp), $b_ptr 1778 mov $in_x+8*2(%rsp), $acc0 1779 mov $in_x+8*3(%rsp), $acc1 1780 lea $M(%rsp), $r_ptr 1781 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); 1782 1783 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 1784 mov $in_x+8*1(%rsp), $acc5 1785 lea $Zsqr(%rsp), $b_ptr 1786 mov $in_x+8*2(%rsp), $acc0 1787 mov $in_x+8*3(%rsp), $acc1 1788 lea $Zsqr(%rsp), $r_ptr 1789 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); 1790 1791 `&load_for_sqr("$S(%rsp)", "$src0")` 1792 movq %xmm1, $r_ptr 1793 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); 1794___ 1795{ 1796######## ecp_nistz256_div_by_2(res_y, res_y); ########################## 1797# operate in 4-5-6-7 "name space" that matches squaring output 1798# 1799my ($poly1,$poly3)=($a_ptr,$t1); 1800my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); 1801 1802$code.=<<___; 1803 xor $t4, $t4 1804 mov $a0, $t0 1805 add \$-1, $a0 1806 mov $a1, $t1 1807 adc $poly1, $a1 1808 mov $a2, $t2 1809 adc \$0, $a2 1810 mov $a3, $t3 1811 adc $poly3, $a3 1812 adc \$0, $t4 1813 xor $a_ptr, $a_ptr # borrow $a_ptr 1814 test \$1, $t0 1815 1816 cmovz $t0, $a0 1817 cmovz $t1, $a1 1818 cmovz $t2, $a2 1819 cmovz $t3, $a3 1820 cmovz $a_ptr, $t4 1821 1822 mov $a1, $t0 # a0:a3>>1 1823 shr \$1, $a0 1824 shl \$63, $t0 1825 mov $a2, $t1 1826 shr \$1, $a1 1827 or $t0, $a0 1828 shl \$63, $t1 1829 mov $a3, $t2 1830 shr \$1, $a2 1831 or $t1, $a1 1832 shl \$63, $t2 1833 mov $a0, 8*0($r_ptr) 1834 shr \$1, $a3 1835 mov $a1, 8*1($r_ptr) 1836 shl \$63, $t4 1837 or $t2, $a2 1838 or $t4, $a3 1839 mov $a2, 8*2($r_ptr) 1840 mov $a3, 8*3($r_ptr) 1841___ 1842} 1843$code.=<<___; 1844 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` 1845 lea $M(%rsp), $r_ptr 1846 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); 1847 1848 lea $tmp0(%rsp), $r_ptr 1849 call __ecp_nistz256_mul_by_2$x 1850 1851 lea $M(%rsp), $b_ptr 1852 lea $M(%rsp), $r_ptr 1853 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); 1854 1855 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` 1856 lea $S(%rsp), $r_ptr 1857 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); 1858 1859 lea $tmp0(%rsp), $r_ptr 1860 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); 1861 1862 `&load_for_sqr("$M(%rsp)", "$src0")` 1863 movq %xmm0, $r_ptr 1864 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); 1865 1866 lea $tmp0(%rsp), $b_ptr 1867 mov $acc6, $acc0 # harmonize sqr output and sub input 1868 mov $acc7, $acc1 1869 mov $a_ptr, $poly1 1870 mov $t1, $poly3 1871 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); 1872 1873 mov $S+8*0(%rsp), $t0 1874 mov $S+8*1(%rsp), $t1 1875 mov $S+8*2(%rsp), $t2 1876 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order 1877 lea $S(%rsp), $r_ptr 1878 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); 1879 1880 mov $M(%rsp), $src0 1881 lea $M(%rsp), $b_ptr 1882 mov $acc4, $acc6 # harmonize sub output and mul input 1883 xor %ecx, %ecx 1884 mov $acc4, $S+8*0(%rsp) # have to save:-( 1885 mov $acc5, $acc2 1886 mov $acc5, $S+8*1(%rsp) 1887 cmovz $acc0, $acc3 1888 mov $acc0, $S+8*2(%rsp) 1889 lea $S-$bias(%rsp), $a_ptr 1890 cmovz $acc1, $acc4 1891 mov $acc1, $S+8*3(%rsp) 1892 mov $acc6, $acc1 1893 lea $S(%rsp), $r_ptr 1894 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); 1895 1896 movq %xmm1, $b_ptr 1897 movq %xmm1, $r_ptr 1898 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); 1899 1900 add \$32*5+8, %rsp 1901 pop %r15 1902 pop %r14 1903 pop %r13 1904 pop %r12 1905 pop %rbx 1906 pop %rbp 1907 ret 1908.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx 1909___ 1910} 1911&gen_double("q"); 1912 1913sub gen_add () { 1914 my $x = shift; 1915 my ($src0,$sfx,$bias); 1916 my ($H,$Hsqr,$R,$Rsqr,$Hcub, 1917 $U1,$U2,$S1,$S2, 1918 $res_x,$res_y,$res_z, 1919 $in1_x,$in1_y,$in1_z, 1920 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); 1921 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 1922 1923 if ($x ne "x") { 1924 $src0 = "%rax"; 1925 $sfx = ""; 1926 $bias = 0; 1927 1928$code.=<<___; 1929.globl ecp_nistz256_point_add 1930.type ecp_nistz256_point_add,\@function,3 1931.align 32 1932ecp_nistz256_point_add: 1933___ 1934$code.=<<___ if ($addx); 1935 mov \$0x80100, %ecx 1936 and OPENSSL_ia32cap_P+8(%rip), %ecx 1937 cmp \$0x80100, %ecx 1938 je .Lpoint_addx 1939___ 1940 } else { 1941 $src0 = "%rdx"; 1942 $sfx = "x"; 1943 $bias = 128; 1944 1945$code.=<<___; 1946.type ecp_nistz256_point_addx,\@function,3 1947.align 32 1948ecp_nistz256_point_addx: 1949.Lpoint_addx: 1950___ 1951 } 1952$code.=<<___; 1953 push %rbp 1954 push %rbx 1955 push %r12 1956 push %r13 1957 push %r14 1958 push %r15 1959 sub \$32*18+8, %rsp 1960 1961 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 1962 movdqu 0x10($a_ptr), %xmm1 1963 movdqu 0x20($a_ptr), %xmm2 1964 movdqu 0x30($a_ptr), %xmm3 1965 movdqu 0x40($a_ptr), %xmm4 1966 movdqu 0x50($a_ptr), %xmm5 1967 mov $a_ptr, $b_ptr # reassign 1968 mov $b_org, $a_ptr # reassign 1969 movdqa %xmm0, $in1_x(%rsp) 1970 movdqa %xmm1, $in1_x+0x10(%rsp) 1971 por %xmm0, %xmm1 1972 movdqa %xmm2, $in1_y(%rsp) 1973 movdqa %xmm3, $in1_y+0x10(%rsp) 1974 por %xmm2, %xmm3 1975 movdqa %xmm4, $in1_z(%rsp) 1976 movdqa %xmm5, $in1_z+0x10(%rsp) 1977 por %xmm1, %xmm3 1978 1979 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr 1980 pshufd \$0xb1, %xmm3, %xmm5 1981 movdqu 0x10($a_ptr), %xmm1 1982 movdqu 0x20($a_ptr), %xmm2 1983 por %xmm3, %xmm5 1984 movdqu 0x30($a_ptr), %xmm3 1985 mov 0x40+8*0($a_ptr), $src0 # load original in2_z 1986 mov 0x40+8*1($a_ptr), $acc6 1987 mov 0x40+8*2($a_ptr), $acc7 1988 mov 0x40+8*3($a_ptr), $acc0 1989 movdqa %xmm0, $in2_x(%rsp) 1990 pshufd \$0x1e, %xmm5, %xmm4 1991 movdqa %xmm1, $in2_x+0x10(%rsp) 1992 por %xmm0, %xmm1 1993 movq $r_ptr, %xmm0 # save $r_ptr 1994 movdqa %xmm2, $in2_y(%rsp) 1995 movdqa %xmm3, $in2_y+0x10(%rsp) 1996 por %xmm2, %xmm3 1997 por %xmm4, %xmm5 1998 pxor %xmm4, %xmm4 1999 por %xmm1, %xmm3 2000 2001 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 2002 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy 2003 mov $acc6, $in2_z+8*1(%rsp) 2004 mov $acc7, $in2_z+8*2(%rsp) 2005 mov $acc0, $in2_z+8*3(%rsp) 2006 lea $Z2sqr(%rsp), $r_ptr # Z2^2 2007 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); 2008 2009 pcmpeqd %xmm4, %xmm5 2010 pshufd \$0xb1, %xmm3, %xmm4 2011 por %xmm3, %xmm4 2012 pshufd \$0, %xmm5, %xmm5 # in1infty 2013 pshufd \$0x1e, %xmm4, %xmm3 2014 por %xmm3, %xmm4 2015 pxor %xmm3, %xmm3 2016 pcmpeqd %xmm3, %xmm4 2017 pshufd \$0, %xmm4, %xmm4 # in2infty 2018 mov 0x40+8*0($b_ptr), $src0 # load original in1_z 2019 mov 0x40+8*1($b_ptr), $acc6 2020 mov 0x40+8*2($b_ptr), $acc7 2021 mov 0x40+8*3($b_ptr), $acc0 2022 2023 lea 0x40-$bias($b_ptr), $a_ptr 2024 lea $Z1sqr(%rsp), $r_ptr # Z1^2 2025 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 2026 2027 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` 2028 lea $S1(%rsp), $r_ptr # S1 = Z2^3 2029 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); 2030 2031 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 2032 lea $S2(%rsp), $r_ptr # S2 = Z1^3 2033 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 2034 2035 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` 2036 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 2037 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); 2038 2039 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 2040 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 2041 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 2042 2043 lea $S1(%rsp), $b_ptr 2044 lea $R(%rsp), $r_ptr # R = S2 - S1 2045 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); 2046 2047 or $acc5, $acc4 # see if result is zero 2048 movdqa %xmm4, %xmm2 2049 or $acc0, $acc4 2050 or $acc1, $acc4 2051 por %xmm5, %xmm2 # in1infty || in2infty 2052 movq $acc4, %xmm3 2053 2054 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` 2055 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 2056 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); 2057 2058 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` 2059 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 2060 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); 2061 2062 lea $U1(%rsp), $b_ptr 2063 lea $H(%rsp), $r_ptr # H = U2 - U1 2064 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); 2065 2066 or $acc5, $acc4 # see if result is zero 2067 or $acc0, $acc4 2068 or $acc1, $acc4 2069 2070 .byte 0x3e # predict taken 2071 jnz .Ladd_proceed$x # is_equal(U1,U2)? 2072 movq %xmm2, $acc0 2073 movq %xmm3, $acc1 2074 test $acc0, $acc0 2075 jnz .Ladd_proceed$x # (in1infty || in2infty)? 2076 test $acc1, $acc1 2077 jz .Ladd_proceed$x # is_equal(S1,S2)? 2078 2079 movq %xmm0, $r_ptr # restore $r_ptr 2080 pxor %xmm0, %xmm0 2081 movdqu %xmm0, 0x00($r_ptr) 2082 movdqu %xmm0, 0x10($r_ptr) 2083 movdqu %xmm0, 0x20($r_ptr) 2084 movdqu %xmm0, 0x30($r_ptr) 2085 movdqu %xmm0, 0x40($r_ptr) 2086 movdqu %xmm0, 0x50($r_ptr) 2087 jmp .Ladd_done$x 2088 2089.align 32 2090.Ladd_proceed$x: 2091 `&load_for_sqr("$R(%rsp)", "$src0")` 2092 lea $Rsqr(%rsp), $r_ptr # R^2 2093 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 2094 2095 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 2096 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 2097 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 2098 2099 `&load_for_sqr("$H(%rsp)", "$src0")` 2100 lea $Hsqr(%rsp), $r_ptr # H^2 2101 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 2102 2103 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` 2104 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 2105 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); 2106 2107 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` 2108 lea $Hcub(%rsp), $r_ptr # H^3 2109 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 2110 2111 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` 2112 lea $U2(%rsp), $r_ptr # U1*H^2 2113 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); 2114___ 2115{ 2116####################################################################### 2117# operate in 4-5-0-1 "name space" that matches multiplication output 2118# 2119my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 2120my ($poly1, $poly3)=($acc6,$acc7); 2121 2122$code.=<<___; 2123 #lea $U2(%rsp), $a_ptr 2124 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 2125 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 2126 2127 add $acc0, $acc0 # a0:a3+a0:a3 2128 lea $Rsqr(%rsp), $a_ptr 2129 adc $acc1, $acc1 2130 mov $acc0, $t0 2131 adc $acc2, $acc2 2132 adc $acc3, $acc3 2133 mov $acc1, $t1 2134 sbb $t4, $t4 2135 2136 sub \$-1, $acc0 2137 mov $acc2, $t2 2138 sbb $poly1, $acc1 2139 sbb \$0, $acc2 2140 mov $acc3, $t3 2141 sbb $poly3, $acc3 2142 test $t4, $t4 2143 2144 cmovz $t0, $acc0 2145 mov 8*0($a_ptr), $t0 2146 cmovz $t1, $acc1 2147 mov 8*1($a_ptr), $t1 2148 cmovz $t2, $acc2 2149 mov 8*2($a_ptr), $t2 2150 cmovz $t3, $acc3 2151 mov 8*3($a_ptr), $t3 2152 2153 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 2154 2155 lea $Hcub(%rsp), $b_ptr 2156 lea $res_x(%rsp), $r_ptr 2157 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 2158 2159 mov $U2+8*0(%rsp), $t0 2160 mov $U2+8*1(%rsp), $t1 2161 mov $U2+8*2(%rsp), $t2 2162 mov $U2+8*3(%rsp), $t3 2163 lea $res_y(%rsp), $r_ptr 2164 2165 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); 2166 2167 mov $acc0, 8*0($r_ptr) # save the result, as 2168 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 2169 mov $acc2, 8*2($r_ptr) 2170 mov $acc3, 8*3($r_ptr) 2171___ 2172} 2173$code.=<<___; 2174 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` 2175 lea $S2(%rsp), $r_ptr 2176 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); 2177 2178 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` 2179 lea $res_y(%rsp), $r_ptr 2180 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); 2181 2182 lea $S2(%rsp), $b_ptr 2183 lea $res_y(%rsp), $r_ptr 2184 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); 2185 2186 movq %xmm0, $r_ptr # restore $r_ptr 2187 2188 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); 2189 movdqa %xmm5, %xmm1 2190 pandn $res_z(%rsp), %xmm0 2191 movdqa %xmm5, %xmm2 2192 pandn $res_z+0x10(%rsp), %xmm1 2193 movdqa %xmm5, %xmm3 2194 pand $in2_z(%rsp), %xmm2 2195 pand $in2_z+0x10(%rsp), %xmm3 2196 por %xmm0, %xmm2 2197 por %xmm1, %xmm3 2198 2199 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 2200 movdqa %xmm4, %xmm1 2201 pandn %xmm2, %xmm0 2202 movdqa %xmm4, %xmm2 2203 pandn %xmm3, %xmm1 2204 movdqa %xmm4, %xmm3 2205 pand $in1_z(%rsp), %xmm2 2206 pand $in1_z+0x10(%rsp), %xmm3 2207 por %xmm0, %xmm2 2208 por %xmm1, %xmm3 2209 movdqu %xmm2, 0x40($r_ptr) 2210 movdqu %xmm3, 0x50($r_ptr) 2211 2212 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 2213 movdqa %xmm5, %xmm1 2214 pandn $res_x(%rsp), %xmm0 2215 movdqa %xmm5, %xmm2 2216 pandn $res_x+0x10(%rsp), %xmm1 2217 movdqa %xmm5, %xmm3 2218 pand $in2_x(%rsp), %xmm2 2219 pand $in2_x+0x10(%rsp), %xmm3 2220 por %xmm0, %xmm2 2221 por %xmm1, %xmm3 2222 2223 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 2224 movdqa %xmm4, %xmm1 2225 pandn %xmm2, %xmm0 2226 movdqa %xmm4, %xmm2 2227 pandn %xmm3, %xmm1 2228 movdqa %xmm4, %xmm3 2229 pand $in1_x(%rsp), %xmm2 2230 pand $in1_x+0x10(%rsp), %xmm3 2231 por %xmm0, %xmm2 2232 por %xmm1, %xmm3 2233 movdqu %xmm2, 0x00($r_ptr) 2234 movdqu %xmm3, 0x10($r_ptr) 2235 2236 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 2237 movdqa %xmm5, %xmm1 2238 pandn $res_y(%rsp), %xmm0 2239 movdqa %xmm5, %xmm2 2240 pandn $res_y+0x10(%rsp), %xmm1 2241 movdqa %xmm5, %xmm3 2242 pand $in2_y(%rsp), %xmm2 2243 pand $in2_y+0x10(%rsp), %xmm3 2244 por %xmm0, %xmm2 2245 por %xmm1, %xmm3 2246 2247 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 2248 movdqa %xmm4, %xmm1 2249 pandn %xmm2, %xmm0 2250 movdqa %xmm4, %xmm2 2251 pandn %xmm3, %xmm1 2252 movdqa %xmm4, %xmm3 2253 pand $in1_y(%rsp), %xmm2 2254 pand $in1_y+0x10(%rsp), %xmm3 2255 por %xmm0, %xmm2 2256 por %xmm1, %xmm3 2257 movdqu %xmm2, 0x20($r_ptr) 2258 movdqu %xmm3, 0x30($r_ptr) 2259 2260.Ladd_done$x: 2261 add \$32*18+8, %rsp 2262 pop %r15 2263 pop %r14 2264 pop %r13 2265 pop %r12 2266 pop %rbx 2267 pop %rbp 2268 ret 2269.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx 2270___ 2271} 2272&gen_add("q"); 2273 2274sub gen_add_affine () { 2275 my $x = shift; 2276 my ($src0,$sfx,$bias); 2277 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, 2278 $res_x,$res_y,$res_z, 2279 $in1_x,$in1_y,$in1_z, 2280 $in2_x,$in2_y)=map(32*$_,(0..14)); 2281 my $Z1sqr = $S2; 2282 2283 if ($x ne "x") { 2284 $src0 = "%rax"; 2285 $sfx = ""; 2286 $bias = 0; 2287 2288$code.=<<___; 2289.globl ecp_nistz256_point_add_affine 2290.type ecp_nistz256_point_add_affine,\@function,3 2291.align 32 2292ecp_nistz256_point_add_affine: 2293___ 2294$code.=<<___ if ($addx); 2295 mov \$0x80100, %ecx 2296 and OPENSSL_ia32cap_P+8(%rip), %ecx 2297 cmp \$0x80100, %ecx 2298 je .Lpoint_add_affinex 2299___ 2300 } else { 2301 $src0 = "%rdx"; 2302 $sfx = "x"; 2303 $bias = 128; 2304 2305$code.=<<___; 2306.type ecp_nistz256_point_add_affinex,\@function,3 2307.align 32 2308ecp_nistz256_point_add_affinex: 2309.Lpoint_add_affinex: 2310___ 2311 } 2312$code.=<<___; 2313 push %rbp 2314 push %rbx 2315 push %r12 2316 push %r13 2317 push %r14 2318 push %r15 2319 sub \$32*15+8, %rsp 2320 2321 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 2322 mov $b_org, $b_ptr # reassign 2323 movdqu 0x10($a_ptr), %xmm1 2324 movdqu 0x20($a_ptr), %xmm2 2325 movdqu 0x30($a_ptr), %xmm3 2326 movdqu 0x40($a_ptr), %xmm4 2327 movdqu 0x50($a_ptr), %xmm5 2328 mov 0x40+8*0($a_ptr), $src0 # load original in1_z 2329 mov 0x40+8*1($a_ptr), $acc6 2330 mov 0x40+8*2($a_ptr), $acc7 2331 mov 0x40+8*3($a_ptr), $acc0 2332 movdqa %xmm0, $in1_x(%rsp) 2333 movdqa %xmm1, $in1_x+0x10(%rsp) 2334 por %xmm0, %xmm1 2335 movdqa %xmm2, $in1_y(%rsp) 2336 movdqa %xmm3, $in1_y+0x10(%rsp) 2337 por %xmm2, %xmm3 2338 movdqa %xmm4, $in1_z(%rsp) 2339 movdqa %xmm5, $in1_z+0x10(%rsp) 2340 por %xmm1, %xmm3 2341 2342 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr 2343 pshufd \$0xb1, %xmm3, %xmm5 2344 movdqu 0x10($b_ptr), %xmm1 2345 movdqu 0x20($b_ptr), %xmm2 2346 por %xmm3, %xmm5 2347 movdqu 0x30($b_ptr), %xmm3 2348 movdqa %xmm0, $in2_x(%rsp) 2349 pshufd \$0x1e, %xmm5, %xmm4 2350 movdqa %xmm1, $in2_x+0x10(%rsp) 2351 por %xmm0, %xmm1 2352 movq $r_ptr, %xmm0 # save $r_ptr 2353 movdqa %xmm2, $in2_y(%rsp) 2354 movdqa %xmm3, $in2_y+0x10(%rsp) 2355 por %xmm2, %xmm3 2356 por %xmm4, %xmm5 2357 pxor %xmm4, %xmm4 2358 por %xmm1, %xmm3 2359 2360 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 2361 lea $Z1sqr(%rsp), $r_ptr # Z1^2 2362 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 2363 2364 pcmpeqd %xmm4, %xmm5 2365 pshufd \$0xb1, %xmm3, %xmm4 2366 mov 0x00($b_ptr), $src0 # $b_ptr is still valid 2367 #lea 0x00($b_ptr), $b_ptr 2368 mov $acc4, $acc1 # harmonize sqr output and mul input 2369 por %xmm3, %xmm4 2370 pshufd \$0, %xmm5, %xmm5 # in1infty 2371 pshufd \$0x1e, %xmm4, %xmm3 2372 mov $acc5, $acc2 2373 por %xmm3, %xmm4 2374 pxor %xmm3, %xmm3 2375 mov $acc6, $acc3 2376 pcmpeqd %xmm3, %xmm4 2377 pshufd \$0, %xmm4, %xmm4 # in2infty 2378 2379 lea $Z1sqr-$bias(%rsp), $a_ptr 2380 mov $acc7, $acc4 2381 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 2382 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); 2383 2384 lea $in1_x(%rsp), $b_ptr 2385 lea $H(%rsp), $r_ptr # H = U2 - U1 2386 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); 2387 2388 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 2389 lea $S2(%rsp), $r_ptr # S2 = Z1^3 2390 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 2391 2392 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 2393 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 2394 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 2395 2396 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 2397 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 2398 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 2399 2400 lea $in1_y(%rsp), $b_ptr 2401 lea $R(%rsp), $r_ptr # R = S2 - S1 2402 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); 2403 2404 `&load_for_sqr("$H(%rsp)", "$src0")` 2405 lea $Hsqr(%rsp), $r_ptr # H^2 2406 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 2407 2408 `&load_for_sqr("$R(%rsp)", "$src0")` 2409 lea $Rsqr(%rsp), $r_ptr # R^2 2410 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 2411 2412 `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` 2413 lea $Hcub(%rsp), $r_ptr # H^3 2414 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 2415 2416 `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` 2417 lea $U2(%rsp), $r_ptr # U1*H^2 2418 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); 2419___ 2420{ 2421####################################################################### 2422# operate in 4-5-0-1 "name space" that matches multiplication output 2423# 2424my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 2425my ($poly1, $poly3)=($acc6,$acc7); 2426 2427$code.=<<___; 2428 #lea $U2(%rsp), $a_ptr 2429 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 2430 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 2431 2432 add $acc0, $acc0 # a0:a3+a0:a3 2433 lea $Rsqr(%rsp), $a_ptr 2434 adc $acc1, $acc1 2435 mov $acc0, $t0 2436 adc $acc2, $acc2 2437 adc $acc3, $acc3 2438 mov $acc1, $t1 2439 sbb $t4, $t4 2440 2441 sub \$-1, $acc0 2442 mov $acc2, $t2 2443 sbb $poly1, $acc1 2444 sbb \$0, $acc2 2445 mov $acc3, $t3 2446 sbb $poly3, $acc3 2447 test $t4, $t4 2448 2449 cmovz $t0, $acc0 2450 mov 8*0($a_ptr), $t0 2451 cmovz $t1, $acc1 2452 mov 8*1($a_ptr), $t1 2453 cmovz $t2, $acc2 2454 mov 8*2($a_ptr), $t2 2455 cmovz $t3, $acc3 2456 mov 8*3($a_ptr), $t3 2457 2458 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 2459 2460 lea $Hcub(%rsp), $b_ptr 2461 lea $res_x(%rsp), $r_ptr 2462 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 2463 2464 mov $U2+8*0(%rsp), $t0 2465 mov $U2+8*1(%rsp), $t1 2466 mov $U2+8*2(%rsp), $t2 2467 mov $U2+8*3(%rsp), $t3 2468 lea $H(%rsp), $r_ptr 2469 2470 call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); 2471 2472 mov $acc0, 8*0($r_ptr) # save the result, as 2473 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 2474 mov $acc2, 8*2($r_ptr) 2475 mov $acc3, 8*3($r_ptr) 2476___ 2477} 2478$code.=<<___; 2479 `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` 2480 lea $S2(%rsp), $r_ptr 2481 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); 2482 2483 `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` 2484 lea $H(%rsp), $r_ptr 2485 call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); 2486 2487 lea $S2(%rsp), $b_ptr 2488 lea $res_y(%rsp), $r_ptr 2489 call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); 2490 2491 movq %xmm0, $r_ptr # restore $r_ptr 2492 2493 movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); 2494 movdqa %xmm5, %xmm1 2495 pandn $res_z(%rsp), %xmm0 2496 movdqa %xmm5, %xmm2 2497 pandn $res_z+0x10(%rsp), %xmm1 2498 movdqa %xmm5, %xmm3 2499 pand .LONE_mont(%rip), %xmm2 2500 pand .LONE_mont+0x10(%rip), %xmm3 2501 por %xmm0, %xmm2 2502 por %xmm1, %xmm3 2503 2504 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 2505 movdqa %xmm4, %xmm1 2506 pandn %xmm2, %xmm0 2507 movdqa %xmm4, %xmm2 2508 pandn %xmm3, %xmm1 2509 movdqa %xmm4, %xmm3 2510 pand $in1_z(%rsp), %xmm2 2511 pand $in1_z+0x10(%rsp), %xmm3 2512 por %xmm0, %xmm2 2513 por %xmm1, %xmm3 2514 movdqu %xmm2, 0x40($r_ptr) 2515 movdqu %xmm3, 0x50($r_ptr) 2516 2517 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 2518 movdqa %xmm5, %xmm1 2519 pandn $res_x(%rsp), %xmm0 2520 movdqa %xmm5, %xmm2 2521 pandn $res_x+0x10(%rsp), %xmm1 2522 movdqa %xmm5, %xmm3 2523 pand $in2_x(%rsp), %xmm2 2524 pand $in2_x+0x10(%rsp), %xmm3 2525 por %xmm0, %xmm2 2526 por %xmm1, %xmm3 2527 2528 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 2529 movdqa %xmm4, %xmm1 2530 pandn %xmm2, %xmm0 2531 movdqa %xmm4, %xmm2 2532 pandn %xmm3, %xmm1 2533 movdqa %xmm4, %xmm3 2534 pand $in1_x(%rsp), %xmm2 2535 pand $in1_x+0x10(%rsp), %xmm3 2536 por %xmm0, %xmm2 2537 por %xmm1, %xmm3 2538 movdqu %xmm2, 0x00($r_ptr) 2539 movdqu %xmm3, 0x10($r_ptr) 2540 2541 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 2542 movdqa %xmm5, %xmm1 2543 pandn $res_y(%rsp), %xmm0 2544 movdqa %xmm5, %xmm2 2545 pandn $res_y+0x10(%rsp), %xmm1 2546 movdqa %xmm5, %xmm3 2547 pand $in2_y(%rsp), %xmm2 2548 pand $in2_y+0x10(%rsp), %xmm3 2549 por %xmm0, %xmm2 2550 por %xmm1, %xmm3 2551 2552 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 2553 movdqa %xmm4, %xmm1 2554 pandn %xmm2, %xmm0 2555 movdqa %xmm4, %xmm2 2556 pandn %xmm3, %xmm1 2557 movdqa %xmm4, %xmm3 2558 pand $in1_y(%rsp), %xmm2 2559 pand $in1_y+0x10(%rsp), %xmm3 2560 por %xmm0, %xmm2 2561 por %xmm1, %xmm3 2562 movdqu %xmm2, 0x20($r_ptr) 2563 movdqu %xmm3, 0x30($r_ptr) 2564 2565 add \$32*15+8, %rsp 2566 pop %r15 2567 pop %r14 2568 pop %r13 2569 pop %r12 2570 pop %rbx 2571 pop %rbp 2572 ret 2573.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx 2574___ 2575} 2576&gen_add_affine("q"); 2577 2578######################################################################## 2579# AD*X magic 2580# 2581if ($addx) { { 2582######################################################################## 2583# operate in 4-5-0-1 "name space" that matches multiplication output 2584# 2585my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 2586 2587$code.=<<___; 2588.type __ecp_nistz256_add_tox,\@abi-omnipotent 2589.align 32 2590__ecp_nistz256_add_tox: 2591 xor $t4, $t4 2592 adc 8*0($b_ptr), $a0 2593 adc 8*1($b_ptr), $a1 2594 mov $a0, $t0 2595 adc 8*2($b_ptr), $a2 2596 adc 8*3($b_ptr), $a3 2597 mov $a1, $t1 2598 adc \$0, $t4 2599 2600 xor $t3, $t3 2601 sbb \$-1, $a0 2602 mov $a2, $t2 2603 sbb $poly1, $a1 2604 sbb \$0, $a2 2605 mov $a3, $t3 2606 sbb $poly3, $a3 2607 2608 bt \$0, $t4 2609 cmovnc $t0, $a0 2610 cmovnc $t1, $a1 2611 mov $a0, 8*0($r_ptr) 2612 cmovnc $t2, $a2 2613 mov $a1, 8*1($r_ptr) 2614 cmovnc $t3, $a3 2615 mov $a2, 8*2($r_ptr) 2616 mov $a3, 8*3($r_ptr) 2617 2618 ret 2619.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox 2620 2621.type __ecp_nistz256_sub_fromx,\@abi-omnipotent 2622.align 32 2623__ecp_nistz256_sub_fromx: 2624 xor $t4, $t4 2625 sbb 8*0($b_ptr), $a0 2626 sbb 8*1($b_ptr), $a1 2627 mov $a0, $t0 2628 sbb 8*2($b_ptr), $a2 2629 sbb 8*3($b_ptr), $a3 2630 mov $a1, $t1 2631 sbb \$0, $t4 2632 2633 xor $t3, $t3 2634 adc \$-1, $a0 2635 mov $a2, $t2 2636 adc $poly1, $a1 2637 adc \$0, $a2 2638 mov $a3, $t3 2639 adc $poly3, $a3 2640 2641 bt \$0, $t4 2642 cmovnc $t0, $a0 2643 cmovnc $t1, $a1 2644 mov $a0, 8*0($r_ptr) 2645 cmovnc $t2, $a2 2646 mov $a1, 8*1($r_ptr) 2647 cmovnc $t3, $a3 2648 mov $a2, 8*2($r_ptr) 2649 mov $a3, 8*3($r_ptr) 2650 2651 ret 2652.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx 2653 2654.type __ecp_nistz256_subx,\@abi-omnipotent 2655.align 32 2656__ecp_nistz256_subx: 2657 xor $t4, $t4 2658 sbb $a0, $t0 2659 sbb $a1, $t1 2660 mov $t0, $a0 2661 sbb $a2, $t2 2662 sbb $a3, $t3 2663 mov $t1, $a1 2664 sbb \$0, $t4 2665 2666 xor $a3 ,$a3 2667 adc \$-1, $t0 2668 mov $t2, $a2 2669 adc $poly1, $t1 2670 adc \$0, $t2 2671 mov $t3, $a3 2672 adc $poly3, $t3 2673 2674 bt \$0, $t4 2675 cmovc $t0, $a0 2676 cmovc $t1, $a1 2677 cmovc $t2, $a2 2678 cmovc $t3, $a3 2679 2680 ret 2681.size __ecp_nistz256_subx,.-__ecp_nistz256_subx 2682 2683.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent 2684.align 32 2685__ecp_nistz256_mul_by_2x: 2686 xor $t4, $t4 2687 adc $a0, $a0 # a0:a3+a0:a3 2688 adc $a1, $a1 2689 mov $a0, $t0 2690 adc $a2, $a2 2691 adc $a3, $a3 2692 mov $a1, $t1 2693 adc \$0, $t4 2694 2695 xor $t3, $t3 2696 sbb \$-1, $a0 2697 mov $a2, $t2 2698 sbb $poly1, $a1 2699 sbb \$0, $a2 2700 mov $a3, $t3 2701 sbb $poly3, $a3 2702 2703 bt \$0, $t4 2704 cmovnc $t0, $a0 2705 cmovnc $t1, $a1 2706 mov $a0, 8*0($r_ptr) 2707 cmovnc $t2, $a2 2708 mov $a1, 8*1($r_ptr) 2709 cmovnc $t3, $a3 2710 mov $a2, 8*2($r_ptr) 2711 mov $a3, 8*3($r_ptr) 2712 2713 ret 2714.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x 2715___ 2716 } 2717&gen_double("x"); 2718&gen_add("x"); 2719&gen_add_affine("x"); 2720} 2721}}} 2722 2723$code =~ s/\`([^\`]*)\`/eval $1/gem; 2724print $code; 2725close STDOUT; 2726