#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
# than twice, >2x, as fast. Most common rsa1024 sign is improved by
# a respectable 50%. It remains to be seen if loop unrolling and
# dedicated squaring routine can provide further improvement...

# July 2011.
#
# Add dedicated squaring procedure. Performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 32221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$flavour = shift; 33221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$output = shift; 34221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromif ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 35221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 36221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 37656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 38656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 39656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 40656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 41656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Projectdie "can't locate x86_64-xlate.pl"; 42656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 43221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromopen STDOUT,"| $^X $xlate $flavour $output"; 44656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 45656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project# int bn_mul_mont( 46656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$rp="%rdi"; # BN_ULONG *rp, 47656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$ap="%rsi"; # const BN_ULONG *ap, 48656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$bp="%rdx"; # const BN_ULONG *bp, 49656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$np="%rcx"; # const BN_ULONG *np, 50656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$n0="%r8"; # const BN_ULONG *n0, 51656d9c7f52f88b3a3daccafa7655dec086c4756eThe 
Android Open Source Project$num="%r9"; # int num); 52656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$lo0="%r10"; 53656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$hi0="%r11"; 54656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$hi1="%r13"; 55656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$i="%r14"; 56656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$j="%r15"; 57656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$m0="%rbx"; 58656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$m1="%rbp"; 59656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 60656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project$code=<<___; 61656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project.text 62656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 63656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project.globl bn_mul_mont 64656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project.type bn_mul_mont,\@function,6 65656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project.align 16 66656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Projectbn_mul_mont: 67392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom test \$3,${num}d 68392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jnz .Lmul_enter 69392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp \$8,${num}d 70392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jb .Lmul_enter 71392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp $ap,$bp 72392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jne .Lmul4x_enter 73392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Lsqr4x_enter 74392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 75392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 
76392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul_enter: 77656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project push %rbx 78656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project push %rbp 79656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project push %r12 80656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project push %r13 81656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project push %r14 82656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project push %r15 83656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 84656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov ${num}d,${num}d 85221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lea 2($num),%r10 86221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov %rsp,%r11 87221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom neg %r10 88221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2)) 89656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project and \$-1024,%rsp # minimize TLB usage 90656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 91221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp 92392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul_body: 93392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $bp,%r12 # reassign $bp 94392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 95392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom $bp="%r12"; 96392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 97656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov ($n0),$n0 # pull n0[0] value 98392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($bp),$m0 # m0=bp[0] 99392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap),%rax 
100656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 101656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project xor $i,$i # i=0 102656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project xor $j,$j # j=0 103656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 104392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $n0,$m1 105656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mulq $m0 # ap[0]*bp[0] 106656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov %rax,$lo0 107392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($np),%rax 108656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 109392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom imulq $lo0,$m1 # "tp[0]"*n0 110392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$hi0 111656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 112392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[0]*m1 113392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$lo0 # discarded 114392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($ap),%rax 115656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project adc \$0,%rdx 116656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov %rdx,$hi1 117656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 118656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project lea 1($j),$j # j++ 119392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .L1st_enter 120392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 121392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 122656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project.L1st: 123392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$hi1 124656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov 
($ap,$j,8),%rax 125656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project adc \$0,%rdx 126392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 127392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $lo0,$hi0 128392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 129392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $hi1,-16(%rsp,$j,8) # tp[j-1] 130392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$hi1 131392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 132392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L1st_enter: 133392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 134392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$hi0 135656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov ($np,$j,8),%rax 136392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 137392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 1($j),$j # j++ 138392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$lo0 139656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 140656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mulq $m1 # np[j]*m1 141392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp $num,$j 142392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jne .L1st 143392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 144392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$hi1 145392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap),%rax # ap[0] 146656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project adc \$0,%rdx 147392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 148656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project adc \$0,%rdx 149392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $hi1,-16(%rsp,$j,8) # 
tp[j-1] 150656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov %rdx,$hi1 151392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $lo0,$hi0 152656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 153656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project xor %rdx,%rdx 154656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project add $hi0,$hi1 155656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project adc \$0,%rdx 156656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov $hi1,-8(%rsp,$num,8) 157656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov %rdx,(%rsp,$num,8) # store upmost overflow bit 158656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 159656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project lea 1($i),$i # i++ 160392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Louter 161392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 162656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project.Louter: 163656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov ($bp,$i,8),$m0 # m0=bp[i] 164392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $j,$j # j=0 165392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $n0,$m1 166392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov (%rsp),$lo0 167656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mulq $m0 # ap[0]*bp[i] 168392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$lo0 # ap[0]*bp[i]+tp[0] 169392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($np),%rax 170656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project adc \$0,%rdx 171656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 172392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom imulq $lo0,$m1 # tp[0]*n0 
173392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$hi0 174656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 175392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[0]*m1 176392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$lo0 # discarded 177392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($ap),%rax 178656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project adc \$0,%rdx 179392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8(%rsp),$lo0 # tp[1] 180656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov %rdx,$hi1 181656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 182656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project lea 1($j),$j # j++ 183392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Linner_enter 184392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 185392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 186656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project.Linner: 187392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$hi1 188656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov ($ap,$j,8),%rax 189656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project adc \$0,%rdx 190392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 191392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov (%rsp,$j,8),$lo0 192392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 193392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $hi1,-16(%rsp,$j,8) # tp[j-1] 194392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$hi1 195392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 196392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Linner_enter: 197392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 
198392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$hi0 199656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov ($np,$j,8),%rax 200656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project adc \$0,%rdx 201392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 202656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov %rdx,$hi0 203392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$hi0 204392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 1($j),$j # j++ 205656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 206656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mulq $m1 # np[j]*m1 207392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp $num,$j 208392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jne .Linner 209392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 210392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$hi1 211392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap),%rax # ap[0] 212656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project adc \$0,%rdx 213392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 214656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov (%rsp,$j,8),$lo0 215392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 216392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $hi1,-16(%rsp,$j,8) # tp[j-1] 217656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov %rdx,$hi1 218656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 219656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project xor %rdx,%rdx 220656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project add $hi0,$hi1 221656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project adc \$0,%rdx 
222656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project add $lo0,$hi1 # pull upmost overflow bit 223656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project adc \$0,%rdx 224656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov $hi1,-8(%rsp,$num,8) 225656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov %rdx,(%rsp,$num,8) # store upmost overflow bit 226656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 227656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project lea 1($i),$i # i++ 228656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project cmp $num,$i 229656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project jl .Louter 230656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 231656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project xor $i,$i # i=0 and clear CF! 232392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov (%rsp),%rax # tp[0] 233392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea (%rsp),$ap # borrow ap for tp 234392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $num,$j # j=num 235656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project jmp .Lsub 236656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project.align 16 237656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project.Lsub: sbb ($np,$i,8),%rax 238656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] 239656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov 8($ap,$i,8),%rax # tp[i+1] 240656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project lea 1($i),$i # i++ 241392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom dec $j # doesnn't affect CF! 
242392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jnz .Lsub 243656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 244656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project sbb \$0,%rax # handle upmost overflow bit 245392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $i,$i 246656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project and %rax,$ap 247656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project not %rax 248656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov $rp,$np 249656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project and %rax,$np 250392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $num,$j # j=num 251656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project or $np,$ap # ap=borrow?tp:rp 252656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project.align 16 253656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project.Lcopy: # copy or in-place refresh 254392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap,$i,8),%rax 255392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $i,(%rsp,$i,8) # zap temporary vector 256392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rax,($rp,$i,8) # rp[i]=tp[i] 257392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 1($i),$i 258392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub \$1,$j 259392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jnz .Lcopy 260392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 261392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8(%rsp,$num,8),%rsi # restore %rsp 262392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov \$1,%rax 263392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov (%rsi),%r15 264392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8(%rsi),%r14 265392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16(%rsi),%r13 
266392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 24(%rsi),%r12 267392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 32(%rsi),%rbp 268392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 40(%rsi),%rbx 269392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 48(%rsi),%rsp 270392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul_epilogue: 271392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ret 272392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size bn_mul_mont,.-bn_mul_mont 273392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 274392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{{{ 275392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @A=("%r10","%r11"); 276392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @N=("%r13","%rdi"); 277392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 278392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type bn_mul4x_mont,\@function,6 279392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 280392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombn_mul4x_mont: 281392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul4x_enter: 282392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %rbx 283392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %rbp 284392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r12 285392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r13 286392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r14 287392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r15 288392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 289392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ${num}d,${num}d 290392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 4($num),%r10 291392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rsp,%r11 292392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom neg %r10 
293392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4)) 294392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and \$-1024,%rsp # minimize TLB usage 295392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 296392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp 297392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul4x_body: 298392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp 299392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,%r12 # reassign $bp 300392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 301392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom $bp="%r12"; 302392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 303392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($n0),$n0 # pull n0[0] value 304392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($bp),$m0 # m0=bp[0] 305392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap),%rax 306392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 307392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $i,$i # i=0 308392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $j,$j # j=0 309392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 310392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $n0,$m1 311392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[0]*bp[0] 312392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rax,$A[0] 313392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($np),%rax 314392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 315392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom imulq $A[0],$m1 # "tp[0]"*n0 316392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 317392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 318392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # 
np[0]*m1 319392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] # discarded 320392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($ap),%rax 321392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 322392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 323392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 324392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 325392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 326392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($np),%rax 327392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 328392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 329392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 330392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 331392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 332392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16($ap),%rax 333392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 334392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] 335392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 4($j),$j # j++ 336392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 337392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],(%rsp) 338392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 339392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .L1st4x 340392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 341392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L1st4x: 342392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 343392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] 344392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($np,$j,8),%rax 345392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 
346392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 347392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 348392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 349392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[0] 350392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($ap,$j,8),%rax 351392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 352392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 353392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 354392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-24(%rsp,$j,8) # tp[j-1] 355392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 356392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 357392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 358392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 359392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($np,$j,8),%rax 360392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 365656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov ($ap,$j,8),%rax 366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-16(%rsp,$j,8) # tp[j-1] 370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 
372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] 374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($np,$j,8),%rax 375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[0] 380392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($ap,$j,8),%rax 381392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 382392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 383392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 384392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-8(%rsp,$j,8) # tp[j-1] 385392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 386392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 387392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 388392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 389392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($np,$j,8),%rax 390392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 391392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 4($j),$j # j++ 392392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 393392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 394392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 395392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 396392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($ap,$j,8),%rax 397392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 
398392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 399392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 400392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-32(%rsp,$j,8) # tp[j-1] 401392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 402392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp $num,$j 403392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jl .L1st4x 404392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 405392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 406392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] 407392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($np,$j,8),%rax 408392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 409392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 410392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 411392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 412392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[0] 413392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($ap,$j,8),%rax 414392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 415392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 416392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 417392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-24(%rsp,$j,8) # tp[j-1] 418392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 419392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 420392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($np,$j,8),%rax 423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 
424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap),%rax # ap[0] 429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 430392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 431392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 432392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-16(%rsp,$j,8) # tp[j-1] 433392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 434392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 435392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $N[1],$N[1] 436392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] 437392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$N[1] 438392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-8(%rsp,$j,8) 439392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],(%rsp,$j,8) # store upmost overflow bit 440392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 441392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 1($i),$i # i++ 442392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 4 443392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Louter4x: 444392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($bp,$i,8),$m0 # m0=bp[i] 445392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $j,$j # j=0 446392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov (%rsp),$A[0] 447392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $n0,$m1 448392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[0]*bp[i] 449392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] # 
ap[0]*bp[i]+tp[0] 450392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($np),%rax 451392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 452392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom imulq $A[0],$m1 # tp[0]*n0 454392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 455392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 456392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[0]*m1 457392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] # "$N[0]", discarded 458392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($ap),%rax 459392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 460392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 461392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 462392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 463392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 464392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($np),%rax 465392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 466392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add 8(%rsp),$A[1] # +tp[1] 467392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 468392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 469392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 470392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 471392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 472392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16($ap),%rax 473392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 474392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 475392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 4($j),$j # j+=2 
476392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 477392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],(%rsp) # tp[j-1] 478392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 479392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Linner4x 480392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 481392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Linner4x: 482392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 483392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] 484392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($np,$j,8),%rax 485392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 486392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 487392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 488392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 489392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 490392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 491392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[0] 492392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($ap,$j,8),%rax 493392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 494392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] 495392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 496392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-24(%rsp,$j,8) # tp[j-1] 497392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 498392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 499392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 500392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 501392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($np,$j,8),%rax 
502392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 503392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add -8(%rsp,$j,8),$A[1] 504392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 505392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 506392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 507392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 508392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 509392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap,$j,8),%rax 510392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 511392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] 512392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 513392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-16(%rsp,$j,8) # tp[j-1] 514392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 515392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 516392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 517392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] 518392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($np,$j,8),%rax 519392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 520392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 521392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 522392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 523392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 524392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 525392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[0] 526392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($ap,$j,8),%rax 527392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 
528392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] 529392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 530392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-8(%rsp,$j,8) # tp[j-1] 531392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 532392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 533392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 534392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 535392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($np,$j,8),%rax 536392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 537392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add 8(%rsp,$j,8),$A[1] 538392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 539392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 4($j),$j # j++ 540392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 541392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 542392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 543392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 544392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($ap,$j,8),%rax 545392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 546392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] 547392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 548392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-32(%rsp,$j,8) # tp[j-1] 549392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 550392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp $num,$j 551392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jl .Linner4x 552392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 553392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 
554392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] 555392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($np,$j,8),%rax 556392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 557392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 558392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 559392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 560392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 561392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 562392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[0] 563392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($ap,$j,8),%rax 564392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 565392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] 566392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 567392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-24(%rsp,$j,8) # tp[j-1] 568392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 569392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 570392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 571392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 572392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($np,$j,8),%rax 573392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 574392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add -8(%rsp,$j,8),$A[1] 575392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 576392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 1($i),$i # i++ 577392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 578392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 579392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 
580392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 581392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap),%rax # ap[0] 582392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 583392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] 584392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 585392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-16(%rsp,$j,8) # tp[j-1] 586392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 587392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 588392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $N[1],$N[1] 589392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] 590392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$N[1] 591392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add (%rsp,$num,8),$N[0] # pull upmost overflow bit 592392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$N[1] 593392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-8(%rsp,$j,8) 594392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],(%rsp,$j,8) # store upmost overflow bit 595392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 596392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp $num,$i 597392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jl .Louter4x 598392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 599392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ 600392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @ri=("%rax","%rdx",$m0,$m1); 601392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 602392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16(%rsp,$num,8),$rp # restore $rp 603392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 0(%rsp),@ri[0] # tp[0] 604392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pxor %xmm0,%xmm0 605392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom mov 8(%rsp),@ri[1] # tp[1] 606392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$2,$num # num/=4 607392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea (%rsp),$ap # borrow ap for tp 608392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $i,$i # i=0 and clear CF! 609392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 610392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub 0($np),@ri[0] 611392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16($ap),@ri[2] # tp[2] 612392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 24($ap),@ri[3] # tp[3] 613392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 8($np),@ri[1] 614392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea -1($num),$j # j=num/4-1 615392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Lsub4x 616392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 617392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsub4x: 618392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] 619392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] 620392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 16($np,$i,8),@ri[2] 621392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 32($ap,$i,8),@ri[0] # tp[i+1] 622392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 40($ap,$i,8),@ri[1] 623392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 24($np,$i,8),@ri[3] 624392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] 625392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] 626392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 32($np,$i,8),@ri[0] 627392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 48($ap,$i,8),@ri[2] 628392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 56($ap,$i,8),@ri[3] 
629392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 40($np,$i,8),@ri[1] 630392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 4($i),$i # i++ 631392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom dec $j # doesn't affect CF! 632392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jnz .Lsub4x 633392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 634392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] 635392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 32($ap,$i,8),@ri[0] # load overflow bit 636392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 16($np,$i,8),@ri[2] 637392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] 638392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 24($np,$i,8),@ri[3] 639392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] 640392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 641392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb \$0,@ri[0] # handle upmost overflow bit 642392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] 643392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $i,$i # i=0 644392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and @ri[0],$ap 645392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom not @ri[0] 646392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $rp,$np 647392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and @ri[0],$np 648392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea -1($num),$j 649392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom or $np,$ap # ap=borrow?tp:rp 650392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 651392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu ($ap),%xmm1 652392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqa %xmm0,(%rsp) 
653392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu %xmm1,($rp) 654392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Lcopy4x 655392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 656392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcopy4x: # copy or in-place refresh 657392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu 16($ap,$i),%xmm2 658392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu 32($ap,$i),%xmm1 659392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqa %xmm0,16(%rsp,$i) 660392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu %xmm2,16($rp,$i) 661392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqa %xmm0,32(%rsp,$i) 662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu %xmm1,32($rp,$i) 663392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 32($i),$i 664656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project dec $j 665392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jnz .Lcopy4x 666656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project 667392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shl \$2,$num 668392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu 16($ap,$i),%xmm2 669392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqa %xmm0,16(%rsp,$i) 670392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu %xmm2,16($rp,$i) 671392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 672392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom} 673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 674221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov 8(%rsp,$num,8),%rsi # restore %rsp 675656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project mov \$1,%rax 676221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov (%rsi),%r15 677221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov 8(%rsi),%r14 
678221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov 16(%rsi),%r13 679221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov 24(%rsi),%r12 680221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov 32(%rsi),%rbp 681221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov 40(%rsi),%rbx 682221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lea 48(%rsi),%rsp 683392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul4x_epilogue: 684221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ret 685392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size bn_mul4x_mont,.-bn_mul4x_mont 686392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 687392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}}} 688392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{{{ 689392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom###################################################################### 690392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# void bn_sqr4x_mont( 691392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $rptr="%rdi"; # const BN_ULONG *rptr, 692392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $aptr="%rsi"; # const BN_ULONG *aptr, 693392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $bptr="%rdx"; # not used 694392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $nptr="%rcx"; # const BN_ULONG *nptr, 695392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $n0 ="%r8"; # const BN_ULONG *n0); 696392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $num ="%r9"; # int num, has to be divisible by 4 and 697392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # not less than 8 698392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 699392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 700392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @A0=("%r10","%r11"); 701392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy 
@A1=("%r12","%r13"); 702392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 703392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 704392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 705392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type bn_sqr4x_mont,\@function,6 706392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 707392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombn_sqr4x_mont: 708392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_enter: 709392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %rbx 710392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %rbp 711392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r12 712392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r13 713392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r14 714392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r15 715392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 716392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shl \$3,${num}d # convert $num to bytes 717392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor %r10,%r10 718392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rsp,%r11 # put aside %rsp 719392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub $num,%r10 # -$num 720392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($n0),$n0 # *n0 721392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num) 722392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and \$-1024,%rsp # minimize TLB usage 723392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ############################################################## 724392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # Stack layout 725392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # 726392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # +0 saved $num, used in 
reduction section 727392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # +8 &t[2*$num], used in reduction section 728392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # +32 saved $rptr 729392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # +40 saved $nptr 730392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # +48 saved *n0 731392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # +56 saved %rsp 732392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # +64 t[2*$num] 733392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # 734392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $rptr,32(%rsp) # save $rptr 735392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $nptr,40(%rsp) 736392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $n0, 48(%rsp) 737392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %r11, 56(%rsp) # save original %rsp 738392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_body: 739392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ############################################################## 740392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # Squaring part: 741392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # 742392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # a) multiply-n-add everything but a[i]*a[i]; 743392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # b) shift result of a) by 1 to the left and accumulate 744392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # a[i]*a[i] products; 745392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # 746392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 32(%r10),$i # $i=-($num-32) 747392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] 748392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 749392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $num,$j # $j=$num 750392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom 751392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # comments apply to $num==8 case 752392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -32($aptr,$i),$a0 # a[0] 753392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 754392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -24($aptr,$i),%rax # a[1] 755392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 756392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($aptr,$i),$ai # a[2] 757392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rax,$a1 758392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 759392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[1]*a[0] 760392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rax,$A0[0] # a[1]*a[0] 761392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax # a[2] 762392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A0[1] 763392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[0],-24($tptr,$i) # t[1] 764392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 765392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[0],$A0[0] 766392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[2]*a[0] 767392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[1] 768392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 769392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[0] 770392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[1],-16($tptr,$i) # t[2] 771392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 772392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea -16($i),$j # j=-16 773392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 774392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 775392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 
8($aptr,$j),$ai # a[3] 776392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a1 # a[2]*a[1] 777392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rax,$A1[0] # a[2]*a[1]+t[3] 778392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 779392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A1[1] 780392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 781392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[1],$A0[1] 782392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A1[0],$A0[0] 783392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 16($j),$j 784392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A0[1] 785392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[3]*a[0] 786392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 787392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 788392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[1] 789392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[0],-8($tptr,$j) # t[3] 790392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Lsqr4x_1st 791392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 792392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 793392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_1st: 794392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($aptr,$j),$ai # a[4] 795392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A1[0],$A1[0] 796392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a1 # a[3]*a[1] 797392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A1[1] # a[3]*a[1]+t[4] 798392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 799392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A1[0] 800392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 801392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor 
$A0[0],$A0[0] 802392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A1[1],$A0[1] 803392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A0[0] 804392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[4]*a[0] 805392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 806392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax # a[3] 807392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[0] 808392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[1],($tptr,$j) # t[4] 809392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 810392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 811392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($aptr,$j),$ai # a[5] 812392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A1[1],$A1[1] 813392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a1 # a[4]*a[3] 814392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A1[0] # a[4]*a[3]+t[5] 815392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 816392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A1[1] 817392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 818392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[1],$A0[1] 819392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A1[0],$A0[0] 820392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A0[1] 821392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[5]*a[2] 822392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 823392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 824392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[1] 825392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[0],8($tptr,$j) # t[5] 826392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 827392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom mov 16($aptr,$j),$ai # a[6] 828392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A1[0],$A1[0] 829392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a1 # a[5]*a[3] 830392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A1[1] # a[5]*a[3]+t[6] 831392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 832392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A1[0] 833392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 834392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[0],$A0[0] 835392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A1[1],$A0[1] 836392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A0[0] 837392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[6]*a[2] 838392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 839392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax # a[3] 840392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[0] 841392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[1],16($tptr,$j) # t[6] 842392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 843392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 844392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 24($aptr,$j),$ai # a[7] 845392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A1[1],$A1[1] 846392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a1 # a[6]*a[5] 847392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A1[0] # a[6]*a[5]+t[7] 848392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 849392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A1[1] 850392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 851392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[1],$A0[1] 852392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A1[0],$A0[0] 
853392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 32($j),$j 854392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A0[1] 855392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[7]*a[4] 856392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 857392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 858392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[1] 859392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[0],-8($tptr,$j) # t[7] 860392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 861392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp \$0,$j 862392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jne .Lsqr4x_1st 863392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 864392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A1[0],$A1[0] 865392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A0[1],$A1[1] 866392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A1[0] 867392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a1 # a[7]*a[5] 868392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A1[1] 869392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A1[0] 870392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 871392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A1[1],($tptr) # t[8] 872392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 16($i),$i 873392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A1[0],8($tptr) # t[9] 874392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Lsqr4x_outer 875392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 876392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 877392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_outer: # comments apply to $num==6 case 878392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -32($aptr,$i),$a0 # a[0] 
879392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 880392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -24($aptr,$i),%rax # a[1] 881392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 882392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($aptr,$i),$ai # a[2] 883392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rax,$a1 884392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 885392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -24($tptr,$i),$A0[0] # t[1] 886392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[1],$A0[1] 887392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[1]*a[0] 888392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[0] # a[1]*a[0]+t[1] 889392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax # a[2] 890392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[1] 891392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[0],-24($tptr,$i) # t[1] 892392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 893392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[0],$A0[0] 894392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 895392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A0[0] 896392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[2]*a[0] 897392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[1] 898392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 899392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[0] 900392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[1],-16($tptr,$i) # t[2] 901392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 902392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea -16($i),$j # j=-16 
903392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A1[0],$A1[0] 904392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 905392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 906392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($aptr,$j),$ai # a[3] 907392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A1[1],$A1[1] 908392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add 8($tptr,$j),$A1[0] 909392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A1[1] 910392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a1 # a[2]*a[1] 911392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A1[0] # a[2]*a[1]+t[3] 912392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 913392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A1[1] 914392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 915392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[1],$A0[1] 916392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A1[0],$A0[0] 917392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A0[1] 918392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[3]*a[0] 919392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 920392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 921392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[1] 922392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[0],8($tptr,$j) # t[3] 923392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 924392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 16($j),$j 925392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Lsqr4x_inner 926392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 927392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 928392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_inner: 929392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom mov ($aptr,$j),$ai # a[4] 930392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A1[0],$A1[0] 931392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add ($tptr,$j),$A1[1] 932392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A1[0] 933392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a1 # a[3]*a[1] 934392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A1[1] # a[3]*a[1]+t[4] 935392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 936392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A1[0] 937392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 938392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[0],$A0[0] 939392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A1[1],$A0[1] 940392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A0[0] 941392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[4]*a[0] 942392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 943392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax # a[3] 944392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[0] 945392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[1],($tptr,$j) # t[4] 946392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 947392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($aptr,$j),$ai # a[5] 948392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A1[1],$A1[1] 949392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add 8($tptr,$j),$A1[0] 950392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A1[1] 951392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a1 # a[4]*a[3] 952392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A1[0] # a[4]*a[3]+t[5] 953392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 954392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A1[1] 
955392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 956392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[1],$A0[1] 957392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A1[0],$A0[0] 958392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 16($j),$j # j++ 959392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A0[1] 960392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[5]*a[2] 961392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 962392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 963392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[1] 964392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 965392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 966392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp \$0,$j 967392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jne .Lsqr4x_inner 968392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 969392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A1[0],$A1[0] 970392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A0[1],$A1[1] 971392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A1[0] 972392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a1 # a[5]*a[3] 973392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A1[1] 974392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A1[0] 975392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 976392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 977392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below 978392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 979392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add \$16,$i 980392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 
jnz .Lsqr4x_outer 981392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 982392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # comments apply to $num==4 case 983392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -32($aptr),$a0 # a[0] 984392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 985392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -24($aptr),%rax # a[1] 986392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 987392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($aptr),$ai # a[2] 988392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rax,$a1 989392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 990392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[1],$A0[1] 991392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[1]*a[0] 992392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 993392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax # a[2] 994392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[1] 995392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[0],-24($tptr) # t[1] 996392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 997392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[0],$A0[0] 998392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 999392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A0[0] 1000392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[2]*a[0] 1001392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[1] 1002392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 1003392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[0] 1004392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 
$A0[1],-16($tptr) # t[2] 1005392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1006392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($aptr),$ai # a[3] 1007392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a1 # a[2]*a[1] 1008392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1009392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 1010392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 1011392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1012392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[1],$A0[1] 1013392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A1[0],$A0[0] 1014392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A1[1] 1015392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A0[1] 1016392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a0 # a[3]*a[0] 1017392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1018392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $ai,%rax 1019392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A0[1] 1020392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[0],-8($tptr) # t[3] 1021392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1022392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A1[0],$A1[0] 1023392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A0[1],$A1[1] 1024392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$A1[0] 1025392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $a1 # a[3]*a[1] 1026392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A1[1] 1027392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($aptr),%rax # a[2] 1028392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$A1[0] 1029392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 
1030392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A1[1],($tptr) # t[4] 1031392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A1[0],8($tptr) # t[5] 1032392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1033392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul $ai # a[2]*a[3] 1034392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 1035392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ 1036392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($shift,$carry)=($a0,$a1); 1037392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @S=(@A1,$ai,$n0); 1038392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 1039392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add \$16,$i 1040392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $shift,$shift 1041392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub $num,$i # $i=16-$num 1042392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $carry,$carry 1043392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1044392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A1[0],%rax # t[5] 1045392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 1046392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rax,8($tptr) # t[5] 1047392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,16($tptr) # t[6] 1048392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $carry,24($tptr) # t[7] 1049392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1050392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($aptr,$i),%rax # a[0] 1051392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 64(%rsp,$num,2),$tptr 1052392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $A0[0],$A0[0] # t[0] 1053392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -24($tptr,$i,2),$A0[1] # t[1] 1054392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1055392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1056392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[0] 1057392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1058392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[1] 1059392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom or $A0[0],$S[1] # | t[2*i]>>63 1060392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1061392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[1],$shift # shift=t[2*i+1]>>63 1062392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul %rax # a[i]*a[i] 1063392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom neg $carry # mov $carry,cf 1064392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1065392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rax,$S[0] 1066392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($aptr,$i),%rax # a[i+1] # prefetch 1067392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[0],-32($tptr,$i,2) 1068392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$S[1] 1069392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1070392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1071392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[1],-24($tptr,$i,2) 1072392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb $carry,$carry # mov cf,$carry 1073392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[0] 1074392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1075392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[1] 1076392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom or $A0[0],$S[3] # | t[2*i]>>63 1077392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 
0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1078392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[1],$shift # shift=t[2*i+1]>>63 1079392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul %rax # a[i]*a[i] 1080392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom neg $carry # mov $carry,cf 1081392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1082392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rax,$S[2] 1083392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 0($aptr,$i),%rax # a[i+1] # prefetch 1084392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[2],-16($tptr,$i,2) 1085392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$S[3] 1086392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 16($i),$i 1087392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[3],-40($tptr,$i,2) 1088392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb $carry,$carry # mov cf,$carry 1089392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Lsqr4x_shift_n_add 1090392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1091392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 1092392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_shift_n_add: 1093392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1094392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[0] 1095392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1096392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[1] 1097392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom or $A0[0],$S[1] # | t[2*i]>>63 1098392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1099392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[1],$shift # shift=t[2*i+1]>>63 
1100392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul %rax # a[i]*a[i] 1101392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom neg $carry # mov $carry,cf 1102392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1103392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rax,$S[0] 1104392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($aptr,$i),%rax # a[i+1] # prefetch 1105392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[0],-32($tptr,$i,2) 1106392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$S[1] 1107392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1108392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1109392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[1],-24($tptr,$i,2) 1110392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb $carry,$carry # mov cf,$carry 1111392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[0] 1112392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1113392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[1] 1114392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom or $A0[0],$S[3] # | t[2*i]>>63 1115392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1116392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[1],$shift # shift=t[2*i+1]>>63 1117392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul %rax # a[i]*a[i] 1118392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom neg $carry # mov $carry,cf 1119392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1120392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rax,$S[2] 1121392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 0($aptr,$i),%rax # a[i+1] # prefetch 
1122392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[2],-16($tptr,$i,2) 1123392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$S[3] 1124392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1125392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1126392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[3],-8($tptr,$i,2) 1127392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb $carry,$carry # mov cf,$carry 1128392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[0] 1129392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1130392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[1] 1131392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom or $A0[0],$S[1] # | t[2*i]>>63 1132392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1133392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[1],$shift # shift=t[2*i+1]>>63 1134392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul %rax # a[i]*a[i] 1135392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom neg $carry # mov $carry,cf 1136392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1137392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rax,$S[0] 1138392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($aptr,$i),%rax # a[i+1] # prefetch 1139392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[0],0($tptr,$i,2) 1140392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$S[1] 1141392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1142392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1143392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[1],8($tptr,$i,2) 1144392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 
$carry,$carry # mov cf,$carry 1145392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[0] 1146392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1147392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[1] 1148392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom or $A0[0],$S[3] # | t[2*i]>>63 1149392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1150392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[1],$shift # shift=t[2*i+1]>>63 1151392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul %rax # a[i]*a[i] 1152392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom neg $carry # mov $carry,cf 1153392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1154392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rax,$S[2] 1155392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16($aptr,$i),%rax # a[i+1] # prefetch 1156392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[2],16($tptr,$i,2) 1157392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$S[3] 1158392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[3],24($tptr,$i,2) 1159392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb $carry,$carry # mov cf,$carry 1160392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add \$32,$i 1161392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jnz .Lsqr4x_shift_n_add 1162392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1163392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1164392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[0] 1165392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1166392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[1] 1167392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom or $A0[0],$S[1] # | t[2*i]>>63 1168392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1169392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $A0[1],$shift # shift=t[2*i+1]>>63 1170392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul %rax # a[i]*a[i] 1171392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom neg $carry # mov $carry,cf 1172392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1173392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rax,$S[0] 1174392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($aptr),%rax # a[i+1] # prefetch 1175392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[0],-32($tptr) 1176392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$S[1] 1177392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1178392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift 1179392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[1],-24($tptr) 1180392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb $carry,$carry # mov cf,$carry 1181392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[0] 1182392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1183392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$63,$A0[1] 1184392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom or $A0[0],$S[3] # | t[2*i]>>63 1185392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mul %rax # a[i]*a[i] 1186392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom neg $carry # mov $carry,cf 1187392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rax,$S[2] 1188392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc %rdx,$S[3] 1189392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $S[2],-16($tptr) 1190392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 
$S[3],-8($tptr)
___
}
##############################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
# Emits the loops .Lsqr4x_mont_outer and .Lsqr4x_mont_inner.  Every
# outer pass derives two multipliers ($m0, then $m1) from the current
# low words of t[] and the saved n0 value, folds n[]*$m0 and n[]*$m1
# into t[], and moves $tptr forward by 16 bytes.  The inner loop steps
# the negative index $j by 32 bytes per iteration and exits when $j
# reaches zero.  The carry out of a pass is kept in $topbit, added back
# in during the next pass, and finally stored at ($tptr) once $tptr has
# reached the end-of-buffer pointer saved at 8(%rsp).
#
# Stack slots touched by this section: 0(%rsp) holds $num, 8(%rsp) the
# end of the t[] buffer, 40(%rsp) the saved $nptr, 48(%rsp) the saved
# n0, and t[] itself starts at 64(%rsp).
#
{
# Register aliases local to this section: $topbit is %rbp, $nptr
# reuses the register of $aptr, and the multipliers reuse $a0/$a1.
my ($topbit,$nptr)=("%rbp",$aptr);
my ($m0,$m1)=($a0,$a1);
# The two most recently loaded words of n[].
my @Ni=("%rbx","%r9");
$code.=<<___;
	mov	40(%rsp),$nptr		# restore $nptr
	mov	48(%rsp),$n0		# restore *n0
	xor	$j,$j
	mov	$num,0(%rsp)		# save $num
	sub	$num,$j			# $j=-$num
	mov	64(%rsp),$A0[0]		# t[0]		# modsched #
	mov	$n0,$m0			#		# modsched #
	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
	lea	64(%rsp,$num),$tptr	# end of t[] window
	mov	%rax,8(%rsp)		# save end of t[] buffer
	lea	($nptr,$num),$nptr	# end of n[] buffer
	xor	$topbit,$topbit		# $topbit=0

	mov	0($nptr,$j),%rax	# n[0]		# modsched #
	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
	imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
	mov	%rax,$Ni[0]		#		# modsched #
	jmp	.Lsqr4x_mont_outer

.align	16
.Lsqr4x_mont_outer:
	xor	$A0[1],$A0[1]
	mul	$m0			# n[0]*m0
	add	%rax,$A0[0]		# n[0]*m0+t[0]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]
	mov	$n0,$m1

	xor	$A0[0],$A0[0]
	add	8($tptr,$j),$A0[1]
	adc	\$0,$A0[0]
	mul	$m0			# n[1]*m0
	add	%rax,$A0[1]		# n[1]*m0+t[1]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]

	imulq	$A0[1],$m1

	mov	16($nptr,$j),$Ni[0]	# n[2]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[0]*m1
	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
	mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],8($tptr,$j)	# "t[1]"

	xor	$A0[1],$A0[1]
	add	16($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[2]*m0
	add	%rax,$A0[0]		# n[2]*m0+t[2]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	24($nptr,$j),$Ni[1]	# n[3]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[1]*m1
	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],16($tptr,$j)	# "t[2]"

	xor	$A0[0],$A0[0]
	add	24($tptr,$j),$A0[1]
	lea	32($j),$j
	adc	\$0,$A0[0]
	mul	$m0			# n[3]*m0
	add	%rax,$A0[1]		# n[3]*m0+t[3]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]
	jmp	.Lsqr4x_mont_inner

.align	16
.Lsqr4x_mont_inner:
	mov	($nptr,$j),$Ni[0]	# n[4]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[2]*m1
	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
	mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],-8($tptr,$j)	# "t[3]"

	xor	$A0[1],$A0[1]
	add	($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[4]*m0
	add	%rax,$A0[0]		# n[4]*m0+t[4]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	8($nptr,$j),$Ni[1]	# n[5]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[3]*m1
	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],($tptr,$j)	# "t[4]"

	xor	$A0[0],$A0[0]
	add	8($tptr,$j),$A0[1]
	adc	\$0,$A0[0]
	mul	$m0			# n[5]*m0
	add	%rax,$A0[1]		# n[5]*m0+t[5]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]


	mov	16($nptr,$j),$Ni[0]	# n[6]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[4]*m1
	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
	mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],8($tptr,$j)	# "t[5]"

	xor	$A0[1],$A0[1]
	add	16($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[6]*m0
	add	%rax,$A0[0]		# n[6]*m0+t[6]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	24($nptr,$j),$Ni[1]	# n[7]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[5]*m1
	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],16($tptr,$j)	# "t[6]"

	xor	$A0[0],$A0[0]
	add	24($tptr,$j),$A0[1]
	lea	32($j),$j
	adc	\$0,$A0[0]
	mul	$m0			# n[7]*m0
	add	%rax,$A0[1]		# n[7]*m0+t[7]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]
	cmp	\$0,$j
	jne	.Lsqr4x_mont_inner

	sub	0(%rsp),$j		# $j=-$num	# modsched #
	mov	$n0,$m0			#		# modsched #

	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[6]*m1
	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],-8($tptr)	# "t[7]"

	xor	$A0[1],$A0[1]
	add	($tptr),$A0[0]		# +t[8]
	adc	\$0,$A0[1]
	mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
	add	$topbit,$A0[0]
	adc	\$0,$A0[1]

	imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
	xor	$A1[0],$A1[0]
	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
	add	$A0[0],$A1[1]
	mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
	adc	\$0,$A1[0]
	mul	$m1			# n[7]*m1
	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
	mov	$Ni[0],%rax		#		# modsched #
	adc	%rdx,$A1[0]
	mov	$A1[1],($tptr)		# "t[8]"

	xor	$topbit,$topbit
	add	8($tptr),$A1[0]		# +t[9]
	adc	$topbit,$topbit
	add	$A0[1],$A1[0]
	lea	16($tptr),$tptr		# "t[$num]>>128"
	adc	\$0,$topbit
	mov	$A1[0],-8($tptr)	# "t[9]"
	cmp	8(%rsp),$tptr		# are we done?
	jb	.Lsqr4x_mont_outer

	mov	0(%rsp),$num		# restore $num
	mov	$topbit,($tptr)		# save $topbit
___
}
##############################################################
# Post-condition, 4x unrolled copy from bn_mul_mont
#
# Subtracts n[] from the result held in the upper half of t[], four
# words per .Lsqr4x_sub iteration with the borrow carried in CF, and
# writes the difference to rp[].  The final borrow is subtracted from
# the saved overflow word to form a mask; the and/not/or sequence then
# makes $tptr point at either t[] or rp[] as the copy source.
# .Lsqr4x_copy moves that source to rp[] in 16-byte chunks while
# overwriting both halves of the temporary vector with zeros.
#
{
# Register aliases local to this section; @ri are the four words in
# flight during the subtraction.
my ($tptr,$nptr)=("%rbx",$aptr);
my @ri=("%rax","%rdx","%r10","%r11");
$code.=<<___;
	mov	64(%rsp,$num),@ri[0]	# tp[0]
	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
	mov	40(%rsp),$nptr		# restore $nptr
	shr	\$5,$num		# num/4
	mov	8($tptr),@ri[1]		# t[1]
	xor	$i,$i			# i=0 and clear CF!

	mov	32(%rsp),$rptr		# restore $rptr
	sub	0($nptr),@ri[0]
	mov	16($tptr),@ri[2]	# t[2]
	mov	24($tptr),@ri[3]	# t[3]
	sbb	8($nptr),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsqr4x_sub
.align	16
.Lsqr4x_sub:
	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($nptr,$i,8),@ri[2]
	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
	mov	40($tptr,$i,8),@ri[1]
	sbb	24($nptr,$i,8),@ri[3]
	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($nptr,$i,8),@ri[0]
	mov	48($tptr,$i,8),@ri[2]
	mov	56($tptr,$i,8),@ri[3]
	sbb	40($nptr,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsqr4x_sub

	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
	sbb	16($nptr,$i,8),@ri[2]
	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($nptr,$i,8),@ri[3]
	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$tptr
	not	@ri[0]
	mov	$rptr,$nptr
	and	@ri[0],$nptr
	lea	-1($num),$j
	or	$nptr,$tptr		# tp=borrow?tp:rp

	pxor	%xmm0,%xmm0
	lea	64(%rsp,$num,8),$nptr
	movdqu	($tptr),%xmm1
	lea	($nptr,$num,8),$nptr
	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
	movdqu	%xmm1,($rptr)
	jmp	.Lsqr4x_copy
.align	16
.Lsqr4x_copy:				# copy or in-place refresh
	movdqu	16($tptr,$i),%xmm2
	movdqu	32($tptr,$i),%xmm1
	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
	movdqu	%xmm2,16($rptr,$i)
	movdqu	%xmm1,32($rptr,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lsqr4x_copy

	movdqu	16($tptr,$i),%xmm2
	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
	movdqu	%xmm2,16($rptr,$i)
___
}
# Epilogue of bn_sqr4x_mont: reload the stack pointer saved at
# 56(%rsp), set the return value to 1, restore %r15, %r14, %r13, %r12,
# %rbp and %rbx from the six slots at that address, and release the
# frame.  .Lsqr4x_epilogue marks the point after which the frame is
# fully unwound (sqr_handler compares against this label).
$code.=<<___;
	mov	56(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lsqr4x_epilogue:
	ret
.size	bn_sqr4x_mont,.-bn_sqr4x_mont
___
}}}
# Identification string placed in the generated output after the code.
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___
# Win64 structured-exception-handling support, emitted only when
# $win64 is set.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Two handlers are generated:
#
#   mul_handler  is registered in .xdata for both bn_mul_mont and
#                bn_mul4x_mont.  It reads the body/epilogue label RVAs
#                from HandlerData[] and adds disp->ImageBase to them.
#                Between those labels it finds the saved stack pointer
#                at 8(Rsp + num*8), with num pulled from offset 192 of
#                the CONTEXT.
#   sqr_handler  is registered for bn_sqr4x_mont.  It compares Rip
#                against .Lsqr4x_body and .Lsqr4x_epilogue directly
#                (the inline comments abbreviate these as .Lsqr_body /
#                .Lsqr_epilogue) and finds the saved stack pointer at
#                56(Rsp).
#
# Both write the recovered %rbx, %rbp and %r12-%r15 back into the
# CONTEXT and continue at .Lcommon_seh_tail.  That tail restores
# Rsp/Rsi/Rdi in the CONTEXT, copies the CONTEXT to disp->ContextRecord,
# calls RtlVirtualUnwind and returns 1 (ExceptionContinueSearch).
# The value 154 loaded into %ecx is the repeat count for "rep movsq",
# i.e. the copy length in 8-byte units.
#
# $rec and $frame are assigned for completeness of the handler
# signature; the code below only uses $context and $disp.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	jmp	.Lcommon_seh_tail
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lsqr4x_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lsqr4x_epilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	56(%rax),%rax		# pull saved stack pointer
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr4x_mont
	.rva	.LSEH_end_bn_sqr4x_mont
	.rva	.LSEH_info_bn_sqr4x_mont

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
___
}

# Emit the accumulated assembly text.
print $code;
# Buffered output is flushed by close, so this is where a failed write
# surfaces.  Die instead of silently leaving truncated assembly behind
# with a zero exit status.  NOTE(review): how STDOUT is opened is not
# visible in this section; if it is a pipe to a translator, close also
# reports that child's failure -- confirm against the top of the file.
close STDOUT or die "error closing STDOUT: $!";