#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to the powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.

# Command line: [flavour] [output]. A first argument containing a dot is
# taken to be the output file name, in which case no flavour was given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 code generation is selected either by flavour or by an .asm output.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or under ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The translator path is quoted so that directories containing spaces work,
# and a pipe that cannot be created is fatal instead of silently yielding
# no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such manner that b[0] is $bp[idx],
				# b[1] is [2^5+idx], etc.
# Scratch registers used by the multiplication loops.
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

# Entry point. Operand sizes that are a multiple of four words and at least
# eight words long are dispatched to .Lmul4x_enter; everything else takes
# the generic path at .Lmul_enter. The prologue zero-extends num, fetches
# the 7th argument (idx) from the caller's stack (56(%rsp) on Win64,
# 8(%rsp) otherwise) and pushes %rbx, %rbp and %r12-%r15.
$code=<<___;
.text

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	mov	${num}d,${num}d
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Win64 only: reserve 0x28 bytes and save %xmm6/%xmm7, which the gather
# code later in this function overwrites with selection masks.
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
.Lmul_alloca:
___
# Carve the temporary vector tp[num+2] out of the stack, align it down to
# 1024 bytes and stash the pre-allocation stack pointer in tp[num+1] so
# the epilogue can find the saved registers again.
$code.=<<___;
	mov	%rsp,%rax
	lea	2($num),%r11
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		# %rdx is clobbered by mulq, so from here on $bp lives in %r12.
		$bp="%r12";
		# Layout of the powers table: word k of power idx is found at
		# byte offset k*$STRIDE + idx*8, so consecutive words of one
		# power are $STRIDE = 2^5*8 = 256 bytes apart.
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
# Main body of bn_mul_mont_gather5 (generic, non-unrolled path).
#
# Gather: idx (in %r10 on entry to this fragment) is split into
#   %r11 = idx & ($N/8-1)   -> word position inside a $N-byte line, and
#   %r10 = ~(idx >> log2($N/8)) & (2^5/($N/8)-1) -> offset into
#          .Lmagic_masks, from which four 64-bit masks are loaded into
#          %xmm4-%xmm7.
# .Lmagic_masks is defined outside the code visible here; presumably
# exactly one of the four masks is all-ones and the rest are zero - TODO
# confirm against the table.
# $bp is biased by +96 so the four loads use displacements -96, -32, +32
# and +96, i.e. one quadword from each of four locations $N bytes apart.
# Every fetch of a bp[] word performs all four loads and combines them
# with pand/por, so the set of addresses touched does not depend on
# which of the four the masks select. After each fetch $bp advances by
# $STRIDE. The fetch of the next word is interleaved with the multiply
# instructions that consume the current one.
#
# Multiplication:
#   .L1st / .L1st_enter      first pass (i=0): tp[] = ap[]*bp[0] + np[]*m1
#   .Louter / .Linner        remaining passes: additionally accumulate tp[]
#   .Lsub                    rp[] = tp[] - np[] with a borrow chain
#   .Lcopy                   copies either tp[] or rp[] to rp[] depending
#                            on the final borrow, and overwrites tp[] with
#                            the loop index as it goes
# The function returns 1 in %rax.
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	movq	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	movq	%xmm0,$m0		# bp[1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	movq	%xmm0,$m0		# bp[i+1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesnn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
___
# Win64 only: reload %xmm6/%xmm7 from the 0x28-byte save area that the
# prologue placed below the pushed registers, then step over it.
$code.=<<___ if ($win64);
	movaps	(%rsi),%xmm6
	movaps	0x10(%rsi),%xmm7
	lea	0x28(%rsi),%rsi
___
# %rsi holds the stack pointer value saved in tp[num+1]; the six pushed
# registers are reloaded from it in reverse push order and %rsp is reset
# to just above them before returning.
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
# Lexical scope for the 4x-unrolled variant that follows; @A names the
# register pair it references as $A[0]/$A[1].
{{{
my @A=("%r10","%r11");
344392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @N=("%r13","%rdi"); 345392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 346392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type bn_mul4x_mont_gather5,\@function,6 347392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 348392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombn_mul4x_mont_gather5: 349392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul4x_enter: 350392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ${num}d,${num}d 351392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov `($win64?56:8)`(%rsp),%r10d # load 7th argument 352392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %rbx 353392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %rbp 354392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r12 355392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r13 356392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r14 357392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r15 358392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 359392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64); 360392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea -0x28(%rsp),%rsp 361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movaps %xmm6,(%rsp) 362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movaps %xmm7,0x10(%rsp) 363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul4x_alloca: 364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 365392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rsp,%rax 367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 4($num),%r11 368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom neg %r11 369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) 
370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and \$-1024,%rsp # minimize TLB usage 371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul4x_body: 374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp 375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,%r12 # reassign $bp 376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom $bp="%r12"; 378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom $STRIDE=2**5*8; # 5 is "window size" 379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom $N=$STRIDE/4; # should match cache line size 380392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 381392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %r10,%r11 382392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$`log($N/8)/log(2)`,%r10 383392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and \$`$N/8-1`,%r11 384392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom not %r10 385392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea .Lmagic_masks(%rip),%rax 386392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" 387392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 96($bp,%r11,8),$bp # pointer within 1st cache line 388392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which 389392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq 8(%rax,%r10,8),%xmm5 # cache line contains element 390392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument 391392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq 24(%rax,%r10,8),%xmm7 
392392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 393392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `0*$STRIDE/4-96`($bp),%xmm0 394392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `1*$STRIDE/4-96`($bp),%xmm1 395392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm4,%xmm0 396392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `2*$STRIDE/4-96`($bp),%xmm2 397392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm5,%xmm1 398392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `3*$STRIDE/4-96`($bp),%xmm3 399392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm6,%xmm2 400392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom por %xmm1,%xmm0 401392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm7,%xmm3 402392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom por %xmm2,%xmm0 403392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea $STRIDE($bp),$bp 404392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom por %xmm3,%xmm0 405392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 406392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq %xmm0,$m0 # m0=bp[0] 407392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($n0),$n0 # pull n0[0] value 408392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap),%rax 409392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 410392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $i,$i # i=0 411392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $j,$j # j=0 412392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 413392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `0*$STRIDE/4-96`($bp),%xmm0 414392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `1*$STRIDE/4-96`($bp),%xmm1 415392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm4,%xmm0 416392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `2*$STRIDE/4-96`($bp),%xmm2 
417392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm5,%xmm1 418392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 419392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $n0,$m1 420392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[0]*bp[0] 421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rax,$A[0] 422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($np),%rax 423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `3*$STRIDE/4-96`($bp),%xmm3 425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm6,%xmm2 426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom por %xmm1,%xmm0 427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm7,%xmm3 428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom imulq $A[0],$m1 # "tp[0]"*n0 430392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 431392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 432392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom por %xmm2,%xmm0 433392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea $STRIDE($bp),$bp 434392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom por %xmm3,%xmm0 435392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 436392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[0]*m1 437392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] # discarded 438392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($ap),%rax 439392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 440392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 441392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 442392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 443392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 
444392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($np),%rax 445392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 446392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 447392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 448392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 449392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 450392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16($ap),%rax 451392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 452392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] 453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 4($j),$j # j++ 454392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 455392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],(%rsp) 456392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 457392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .L1st4x 458392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 459392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L1st4x: 460392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 461392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] 462392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($np,$j,8),%rax 463392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 464392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 465392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 466392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 467392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[0] 468392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($ap,$j,8),%rax 469392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 470392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 
471392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 472392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-24(%rsp,$j,8) # tp[j-1] 473392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 474392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 475392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 476392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 477392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($np,$j,8),%rax 478392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 479392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 480392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 481392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 482392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 483392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap,$j,8),%rax 484392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 485392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 486392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 487392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-16(%rsp,$j,8) # tp[j-1] 488392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 489392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 490392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 491392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] 492392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($np,$j,8),%rax 493392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 494392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 495392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 496392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 
497392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[0] 498392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($ap,$j,8),%rax 499392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 500392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 501392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 502392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-8(%rsp,$j,8) # tp[j-1] 503392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 504392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 505392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 506392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 507392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($np,$j,8),%rax 508392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 509392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 4($j),$j # j++ 510392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 511392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 512392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 513392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 514392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($ap,$j,8),%rax 515392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 516392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 517392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 518392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-32(%rsp,$j,8) # tp[j-1] 519392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 520392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp $num,$j 521392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jl .L1st4x 522392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 
523392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 524392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] 525392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($np,$j,8),%rax 526392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 527392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 528392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 529392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 530392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[0] 531392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($ap,$j,8),%rax 532392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 533392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 534392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 535392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-24(%rsp,$j,8) # tp[j-1] 536392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 537392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 538392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[0] 539392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 540392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($np,$j,8),%rax 541392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 542392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 543392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 544392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 545392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 546392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap),%rax # ap[0] 547392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 548392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 
549392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 550392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-16(%rsp,$j,8) # tp[j-1] 551392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 552392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 553392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq %xmm0,$m0 # bp[1] 554392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 555392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $N[1],$N[1] 556392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] 557392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$N[1] 558392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-8(%rsp,$j,8) 559392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],(%rsp,$j,8) # store upmost overflow bit 560392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 561392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 1($i),$i # i++ 562392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 4 563392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Louter4x: 564392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $j,$j # j=0 565392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `0*$STRIDE/4-96`($bp),%xmm0 566392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `1*$STRIDE/4-96`($bp),%xmm1 567392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm4,%xmm0 568392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `2*$STRIDE/4-96`($bp),%xmm2 569392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm5,%xmm1 570392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 571392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov (%rsp),$A[0] 572392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $n0,$m1 573392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[0]*bp[i] 574392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] # 
ap[0]*bp[i]+tp[0] 575392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($np),%rax 576392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 577392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 578392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `3*$STRIDE/4-96`($bp),%xmm3 579392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm6,%xmm2 580392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom por %xmm1,%xmm0 581392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm7,%xmm3 582392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 583392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom imulq $A[0],$m1 # tp[0]*n0 584392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 585392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 586392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom por %xmm2,%xmm0 587392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea $STRIDE($bp),$bp 588392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom por %xmm3,%xmm0 589392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 590392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[0]*m1 591392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] # "$N[0]", discarded 592392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($ap),%rax 593392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 594392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 595392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 596392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 597392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 598392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($np),%rax 599392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 600392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add 8(%rsp),$A[1] # +tp[1] 
601392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 602392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 603392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 604392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 605392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 606392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16($ap),%rax 607392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 608392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 609392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 4($j),$j # j+=4 610392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 611392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 612392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Linner4x 613392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 614392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Linner4x: 615392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 616392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] 617392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($np,$j,8),%rax 618392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 619392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 620392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 621392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 622392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 623392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 624392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[0] 625392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($ap,$j,8),%rax 626392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 
627392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] 628392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 629392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-32(%rsp,$j,8) # tp[j-1] 630392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 631392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 632392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 633392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 634392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($np,$j,8),%rax 635392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 636392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add -8(%rsp,$j,8),$A[1] 637392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 638392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 639392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 640392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 641392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 642392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap,$j,8),%rax 643392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 644392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] 645392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 646392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-24(%rsp,$j,8) # tp[j-1] 647392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 648392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 649392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 650392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] 651392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($np,$j,8),%rax 652392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 
653392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 654392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 655392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 656392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 657392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 658392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[0] 659392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($ap,$j,8),%rax 660392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 661392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] 662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 663392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-16(%rsp,$j,8) # tp[j-1] 664392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 665392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 666392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 667392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 668392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($np,$j,8),%rax 669392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 670392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add 8(%rsp,$j,8),$A[1] 671392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 672392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 4($j),$j # j++ 673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 674392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 675392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 676392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 677392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($ap,$j,8),%rax 678392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 
679392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] 680392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 681392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-40(%rsp,$j,8) # tp[j-1] 682392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 683392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp $num,$j 684392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jl .Linner4x 685392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 686392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 687392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[0] 688392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16($np,$j,8),%rax 689392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 690392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 691392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 692392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[1] 693392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 694392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 695392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[0] 696392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($ap,$j,8),%rax 697392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 698392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] 699392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 700392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-32(%rsp,$j,8) # tp[j-1] 701392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[1] 702392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 703392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m0 # ap[j]*bp[i] 704392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$A[1] 
705392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8($np,$j,8),%rax 706392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 707392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add -8(%rsp,$j,8),$A[1] 708392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 709392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 1($i),$i # i++ 710392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$A[0] 711392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 712392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mulq $m1 # np[j]*m1 713392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add %rax,$N[1] 714392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($ap),%rax # ap[0] 715392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 716392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[1],$N[1] 717392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,%rdx 718392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-24(%rsp,$j,8) # tp[j-1] 719392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdx,$N[0] 720392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 721392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq %xmm0,$m0 # bp[i+1] 722392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[1],-16(%rsp,$j,8) # tp[j-1] 723392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 724392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $N[1],$N[1] 725392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $A[0],$N[0] 726392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$N[1] 727392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add (%rsp,$num,8),$N[0] # pull upmost overflow bit 728392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom adc \$0,$N[1] 729392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $N[0],-8(%rsp,$j,8) 730392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 
$N[1],(%rsp,$j,8) # store upmost overflow bit 731392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 732392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp $num,$i 733392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jl .Louter4x 734392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 735392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ 736392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @ri=("%rax","%rdx",$m0,$m1); 737392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 738392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16(%rsp,$num,8),$rp # restore $rp 739392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 0(%rsp),@ri[0] # tp[0] 740392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pxor %xmm0,%xmm0 741392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8(%rsp),@ri[1] # tp[1] 742392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$2,$num # num/=4 743392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea (%rsp),$ap # borrow ap for tp 744392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $i,$i # i=0 and clear CF! 
745392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 746392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub 0($np),@ri[0] 747392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16($ap),@ri[2] # tp[2] 748392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 24($ap),@ri[3] # tp[3] 749392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 8($np),@ri[1] 750392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea -1($num),$j # j=num/4-1 751392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Lsub4x 752392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 753392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsub4x: 754392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] 755392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] 756392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 16($np,$i,8),@ri[2] 757392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 32($ap,$i,8),@ri[0] # tp[i+1] 758392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 40($ap,$i,8),@ri[1] 759392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 24($np,$i,8),@ri[3] 760392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] 761392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] 762392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 32($np,$i,8),@ri[0] 763392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 48($ap,$i,8),@ri[2] 764392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 56($ap,$i,8),@ri[3] 765392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 40($np,$i,8),@ri[1] 766392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 4($i),$i # i+=4 767392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom dec $j # doesn't affect CF! 
768392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jnz .Lsub4x 769392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 770392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] 771392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 32($ap,$i,8),@ri[0] # load overflow bit 772392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 16($np,$i,8),@ri[2] 773392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] 774392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb 24($np,$i,8),@ri[3] 775392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] 776392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 777392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sbb \$0,@ri[0] # handle upmost overflow bit 778392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] 779392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $i,$i # i=0 780392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and @ri[0],$ap 781392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom not @ri[0] 782392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $rp,$np 783392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and @ri[0],$np 784392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea -1($num),$j 785392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom or $np,$ap # ap=borrow?tp:rp 786392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 787392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu ($ap),%xmm1 788392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqa %xmm0,(%rsp) 789392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu %xmm1,($rp) 790392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Lcopy4x 791392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 792392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcopy4x: 
# copy or in-place refresh 793392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu 16($ap,$i),%xmm2 794392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu 32($ap,$i),%xmm1 795392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqa %xmm0,16(%rsp,$i) 796392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu %xmm2,16($rp,$i) 797392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqa %xmm0,32(%rsp,$i) 798392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu %xmm1,32($rp,$i) 799392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 32($i),$i 800392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom dec $j 801392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jnz .Lcopy4x 802392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 803392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shl \$2,$num 804392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu 16($ap,$i),%xmm2 805392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqa %xmm0,16(%rsp,$i) 806392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movdqu %xmm2,16($rp,$i) 807392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 808392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom} 809392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 810392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8(%rsp,$num,8),%rsi # restore %rsp 811392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov \$1,%rax 812392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 813392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64); 814392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movaps (%rsi),%xmm6 815392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movaps 0x10(%rsi),%xmm7 816392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 0x28(%rsi),%rsi 817392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 818392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom$code.=<<___; 819392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov (%rsi),%r15 820392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8(%rsi),%r14 821392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16(%rsi),%r13 822392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 24(%rsi),%r12 823392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 32(%rsi),%rbp 824392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 40(%rsi),%rbx 825392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 48(%rsi),%rsp 826392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul4x_epilogue: 827392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ret 828392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 829392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 830392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}}} 831392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 832392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ 833392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order 834392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ("%rdi","%rsi","%rdx","%rcx"); # Unix order 835392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $out=$inp; # $out names the same register as $inp (first argument); NOTE(review): bn_scatter5 never references $out -- presumably it is for the bn_gather5 code that follows, confirm 836392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $STRIDE=2**5*8; # 2^5 slots of 8 bytes = 256 bytes, the same value as the literal 32*8 step bn_scatter5 adds to $tbl after every stored word 837392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $N=$STRIDE/4; # a quarter of the stride; used together with $STRIDE by the bn_gather5 code emitted after bn_scatter5 838392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # bn_scatter5 ($inp,$num,$tbl,$idx picked above for the Win64 or Unix register order): returns at once when $num is zero; otherwise advances $tbl by 8*$idx, then copies $num 8-byte words from consecutive addresses at $inp into the table, storing each word 32*8 bytes after the previous one. Only %rax is used as scratch. 839392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 840392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl bn_scatter5 841392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type bn_scatter5,\@abi-omnipotent 842392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 843392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombn_scatter5: 
844392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp \$0, $num 845392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jz .Lscatter_epilogue 846392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea ($tbl,$idx,8),$tbl 847392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lscatter: 848392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ($inp),%rax 849392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 8($inp),$inp 850392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rax,($tbl) 851392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 32*8($tbl),$tbl 852392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub \$1,$num 853392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jnz .Lscatter 854392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lscatter_epilogue: 855392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ret 856392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size bn_scatter5,.-bn_scatter5 857392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 858392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl bn_gather5 859392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type bn_gather5,\@abi-omnipotent 860392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 861392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombn_gather5: 862392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 863392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64); 864392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_begin_bn_gather5: 865392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom # I can't trust assembler to use specific encoding:-( 866392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp 867392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) 868392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa 
%xmm7,0x10(%rsp) 869392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 870392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 871392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $idx,%r11 872392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr \$`log($N/8)/log(2)`,$idx 873392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and \$`$N/8-1`,%r11 874392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom not $idx 875392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea .Lmagic_masks(%rip),%rax 876392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and \$`2**5/($N/8)-1`,$idx # 5 is "window size" 877392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line 878392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which 879392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq 8(%rax,$idx,8),%xmm5 # cache line contains element 880392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument 881392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq 24(%rax,$idx,8),%xmm7 882392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jmp .Lgather 883392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 884392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lgather: 885392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `0*$STRIDE/4-96`($tbl),%xmm0 886392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `1*$STRIDE/4-96`($tbl),%xmm1 887392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm4,%xmm0 888392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `2*$STRIDE/4-96`($tbl),%xmm2 889392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm5,%xmm1 890392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq `3*$STRIDE/4-96`($tbl),%xmm3 891392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand 
%xmm6,%xmm2 892392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom por %xmm1,%xmm0 893392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pand %xmm7,%xmm3 894392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom por %xmm2,%xmm0 895392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea $STRIDE($tbl),$tbl 896392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom por %xmm3,%xmm0 897392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 898392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movq %xmm0,($out) # m0=bp[0] 899392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 8($out),$out 900392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub \$1,$num 901392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jnz .Lgather 902392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 903392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64); 904392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movaps %xmm6,(%rsp) 905392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movaps %xmm7,0x10(%rsp) 906392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 0x28(%rsp),%rsp 907392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 908392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 909392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ret 910392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_end_bn_gather5: 911392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size bn_gather5,.-bn_gather5 912392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 913392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom} 914392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 915392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 64 916392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmagic_masks: 917392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .long 0,0, 0,0, 0,0, -1,-1 918392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .long 
0,0, 0,0, 0,0, 0,0 919392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 920392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 921392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 922392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 923392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CONTEXT *context,DISPATCHER_CONTEXT *disp) 924392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromif ($win64) { 925392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$rec="%rcx"; 926392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$frame="%rdx"; 927392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$context="%r8"; 928392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$disp="%r9"; 929392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 930392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 931392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.extern __imp_RtlVirtualUnwind 932392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type mul_handler,\@abi-omnipotent 933392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16 934392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommul_handler: 935392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %rsi 936392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %rdi 937392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %rbx 938392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %rbp 939392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r12 940392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r13 941392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r14 942392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom push %r15 943392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pushfq 
944392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub \$64,%rsp 945392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 946392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 120($context),%rax # pull context->Rax 947392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 248($context),%rbx # pull context->Rip 948392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 949392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8($disp),%rsi # disp->ImageBase 950392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 56($disp),%r11 # disp->HandlerData 951392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 952392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 0(%r11),%r10d # HandlerData[0] 953392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea (%rsi,%r10),%r10 # end of prologue label 954392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp %r10,%rbx # context->Rip<end of prologue label 955392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jb .Lcommon_seh_tail 956392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 957392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea `40+48`(%rax),%rax 958392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 959392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 4(%r11),%r10d # HandlerData[1] 960392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea (%rsi,%r10),%r10 # end of alloca label 961392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp %r10,%rbx # context->Rip<end of alloca label 962392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jb .Lcommon_seh_tail 963392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 964392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 152($context),%rax # pull context->Rsp 965392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 966392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8(%r11),%r10d # HandlerData[2] 967392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 
(%rsi,%r10),%r10 # epilogue label 968392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp %r10,%rbx # context->Rip>=epilogue label 969392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom jae .Lcommon_seh_tail 970392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 971392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 192($context),%r10 # pull $num 972392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8(%rax,%r10,8),%rax # pull saved stack pointer 973392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 974392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movaps (%rax),%xmm0 975392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movaps 16(%rax),%xmm1 976392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea `40+48`(%rax),%rax 977392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 978392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -8(%rax),%rbx 979392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -16(%rax),%rbp 980392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -24(%rax),%r12 981392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -32(%rax),%r13 982392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -40(%rax),%r14 983392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov -48(%rax),%r15 984392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rbx,144($context) # restore context->Rbx 985392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rbp,160($context) # restore context->Rbp 986392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %r12,216($context) # restore context->R12 987392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %r13,224($context) # restore context->R13 988392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %r14,232($context) # restore context->R14 989392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %r15,240($context) # restore context->R15 990392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 
movups %xmm0,512($context) # restore context->Xmm6 991392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom movups %xmm1,528($context) # restore context->Xmm7 992392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 993392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcommon_seh_tail: 994392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8(%rax),%rdi 995392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16(%rax),%rsi 996392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rax,152($context) # restore context->Rsp 997392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rsi,168($context) # restore context->Rsi 998392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rdi,176($context) # restore context->Rdi 999392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1000392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 40($disp),%rdi # disp->ContextRecord 1001392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $context,%rsi # context 1002392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov \$154,%ecx # sizeof(CONTEXT) 1003392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .long 0xa548f3fc # cld; rep movsq 1004392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1005392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $disp,%rsi 1006392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1007392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 8(%rsi),%rdx # arg2, disp->ImageBase 1008392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 0(%rsi),%r8 # arg3, disp->ControlPc 1009392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1010392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov 40(%rsi),%r10 # disp->ContextRecord 1011392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom lea 56(%rsi),%r11 # &disp->HandlerData 1012392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 
lea 24(%rsi),%r12 # &disp->EstablisherFrame 1013392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %r10,32(%rsp) # arg5 1014392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %r11,40(%rsp) # arg6 1015392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %r12,48(%rsp) # arg7 1016392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov %rcx,56(%rsp) # arg8, (NULL) 1017392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom call *__imp_RtlVirtualUnwind(%rip) 1018392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1019392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov \$1,%eax # ExceptionContinueSearch 1020392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add \$64,%rsp 1021392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom popfq 1022392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pop %r15 1023392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pop %r14 1024392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pop %r13 1025392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pop %r12 1026392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pop %rbp 1027392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pop %rbx 1028392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pop %rdi 1029392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom pop %rsi 1030392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ret 1031392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size mul_handler,.-mul_handler 1032392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1033392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.section .pdata 1034392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 4 1035392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva .LSEH_begin_bn_mul_mont_gather5 1036392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva .LSEH_end_bn_mul_mont_gather5 1037392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva .LSEH_info_bn_mul_mont_gather5 
1038392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1039392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva .LSEH_begin_bn_mul4x_mont_gather5 1040392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva .LSEH_end_bn_mul4x_mont_gather5 1041392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva .LSEH_info_bn_mul4x_mont_gather5 1042392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1043392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva .LSEH_begin_bn_gather5 1044392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva .LSEH_end_bn_gather5 1045392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva .LSEH_info_bn_gather5 1046392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1047392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.section .xdata 1048392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 8 1049392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_bn_mul_mont_gather5: 1050392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .byte 9,0,0,0 1051392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva mul_handler 1052392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[] 1053392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 8 1054392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_bn_mul4x_mont_gather5: 1055392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .byte 9,0,0,0 1056392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva mul_handler 1057392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] 1058392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 8 1059392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_bn_gather5: 1060392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .byte 0x01,0x0d,0x05,0x00 1061392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .byte 0x0d,0x78,0x01,0x00 
#movaps 0x10(rsp),xmm7 1062392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 1063392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 1064392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 8 1065392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 1066392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom} 1067392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1068392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code =~ s/\`([^\`]*)\`/eval($1)/gem; 1069392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 1070392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromprint $code; 1071392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromclose STDOUT; 1072