#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2010
#
# "Teaser" Montgomery multiplication module for IA-64. There are
# several possibilities for improvement:
#
# - modulo-scheduling outer loop would eliminate quite a number of
#   stalls after ldf8, xma and getf.sig outside inner loop and
#   improve shorter key performance;
# - shorter vector support [with input vectors being fetched only
#   once] should be added;
# - 2x unroll with help of n0[1] would make the code scalable on
#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
#   acute interest, because upcoming Tukwila's individual cores are
#   reportedly based on Itanium 2 design;
# - dedicated squaring procedure(?);
#
# January 2010
#
# Shorter vector support is implemented by zero-padding ap and np
# vectors up to 8 elements, or 512 bits. This means that 256-bit
# inputs will be processed only 2 times faster than 512-bit inputs,
# not 4 [as one would expect, because algorithm complexity is n^2].
# The reason for padding is that inputs shorter than 512 bits won't
# be processed faster anyway, because minimal critical path of the
# core loop happens to match 512-bit timing. Either way, it resulted
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
# 1024-bit one [in comparison to original version of *this* module].
#
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
#                   sign    verify    sign/s verify/s
# rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
# rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
# dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
#
# ... and *without* (but still with ia64.S):
#
# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
#
# As it can be seen, RSA sign performance improves by 130-30%,
# hereafter less for longer keys, while verify - by 74-13%.
# DSA performance improves by 115-30%.

# $ADDP is the instruction emitted below wherever a pointer argument is
# copied or offset.  On HP-UX it defaults to "addp4" and is switched to
# plain "add" when a +DD64 or -mlp64 flag appears on the command line;
# on every other OS it is always "add".
#
# The flag test uses an alternation.  The earlier bracketed form
# /[\+DD|\-mlp]64/ was a character class, i.e. it matched any single one
# of the characters + D | - m l p followed by "64".
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
} else { $ADDP="add"; }

# The assembly text is accumulated in $code; $ADDP is interpolated into it.
$code=<<___;
.explicit
.text

// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
//		    const BN_ULONG *bp,const BN_ULONG *np,
//		    const BN_ULONG *n0p,int num);
//
// Entry point: dispatches on the 32-bit value in r37 (presumably num, the
// sixth argument of the prototype above).  ret0 is preset to 0.
//   r37 < 2       : return at once with ret0 = 0
//   2 <= r37 <= 8 : branch to bn_mul_mont_8
//   r37 > 8       : branch to bn_mul_mont_general
.align	64
.global	bn_mul_mont#
.proc	bn_mul_mont#
bn_mul_mont:
	.prologue
	.body
{ .mmi;	cmp4.le		p6,p7=2,r37;;
(p6)	cmp4.lt.unc	p8,p9=8,r37
	mov		ret0=r0		};;
{ .bbb;
(p9)	br.cond.dptk.many	bn_mul_mont_8
(p8)	br.cond.dpnt.many	bn_mul_mont_general
(p7)	br.ret.spnt.many	b0	};;
.endp	bn_mul_mont#

// Symbolic register names used by the procedures below.
prevfs=r2;	prevpr=r3;	prevlc=r10;	prevsp=r11;

rptr=r8;	aptr=r9;	bptr=r14;	nptr=r15;
tptr=r16;	// &tp[0]
tp_1=r17;	// &tp[-1]
num=r18;	len=r19;	lc=r20;
topbit=r21;	// carry bit from tmp[num]

n0=f6;
m0=f7;
bi=f8;

// General path, reached from bn_mul_mont when r37 > 8.
// Saves ar.pfs, ar.lc, sp and pr, carves a 16-byte aligned scratch area of
// 8*num+16 bytes (before alignment) below sp for tp[], and returns with
// ret0 = 1 after restoring ar.lc, sp and pr.
.align	64
.local	bn_mul_mont_general#
.proc	bn_mul_mont_general#
bn_mul_mont_general:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,6,2,0,8
	$ADDP	aptr=0,in1
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc		}
{ .mmi;	.vframe	prevsp
	mov	prevsp=sp
	$ADDP	bptr=0,in2
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotf		alo[6],nlo[4],ahi[8],nhi[6]
	.rotr		a[3],n[3],t[2]

{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
	ldf8		alo[4]=[aptr],16	// ap[0]
	$ADDP		r30=8,in1	};;
{ .mmi;	ldf8		alo[3]=[r30],16		// ap[1]
	ldf8		alo[2]=[aptr],16	// ap[2]
	$ADDP		in4=0,in4	};;
{ .mmi;	ldf8		alo[1]=[r30]		// ap[3]
	ldf8		n0=[in4]		// n0
	$ADDP		rptr=0,in0		}
{ .mmi;	$ADDP		nptr=0,in3
	mov		r31=16
	zxt4		num=in5		};;
{ .mmi;	ldf8		nlo[2]=[nptr],8		// np[0]
	shladd		len=num,3,r0
	shladd		r31=num,3,r31	};;
{ .mmi;	ldf8		nlo[1]=[nptr],8		// np[1]
	add		lc=-5,num
	sub		r31=sp,r31	};;
{ .mfb;	and		sp=-16,r31		// alloca
	xmpy.hu		ahi[2]=alo[4],bi	// ap[0]*bp[0]
	nop.b		0		}
{ .mfb;	nop.m		0
	xmpy.lu		alo[4]=alo[4],bi
	brp.loop.imp	.L1st_ctop,.L1st_cend-16
					};;
{ .mfi;	nop.m		0
	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[0]
	add		tp_1=8,sp	}
{ .mfi;	nop.m		0
	xma.lu		alo[3]=alo[3],bi,ahi[2]
	mov		pr.rot=0x20001f<<16
			// ------^----- (p40) at first (p23)
			// ----------^^ p[16:20]=1
					};;
{ .mfi;	nop.m		0
	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[0])*n0
	mov		ar.lc=lc	}
{ .mfi;	nop.m		0
	fcvt.fxu.s1	nhi[1]=f0
	mov		ar.ec=8		};;

// First pass (bp[0]): software-pipelined loop that stores results through tp_1.
.align	32
.L1st_ctop:
.pred.rel	"mutex",p40,p42
{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
	(p40)	add		n[2]=n[2],a[2]		}   // (p23)					}
{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)(p16)
	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
	(p42)	cmp.leu		p41,p39=n[2],a[2]	}   // (p23)
{ .mfi;	(p23)	st8		[tp_1]=n[2],8
	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
{ .mmb;	(p21)	getf.sig	n[0]=nlo[3]
	(p16)	nop.m		0
	br.ctop.sptk	.L1st_ctop			};;
.L1st_cend:

{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
	getf.sig	n[0]=nhi[4]
	add		num=-1,num	};;	// num--
{ .mmi;	.pred.rel	"mutex",p40,p42
(p40)	add		n[0]=n[0],a[0]
(p42)	add		n[0]=n[0],a[0],1
	sub		aptr=aptr,len	};;	// rewind
{ .mmi;	.pred.rel	"mutex",p40,p42
(p40)	cmp.ltu		p41,p39=n[0],a[0]
(p42)	cmp.leu		p41,p39=n[0],a[0]
	sub		nptr=nptr,len	};;
{ .mmi;	.pred.rel	"mutex",p39,p41
(p39)	add		topbit=r0,r0
(p41)	add		topbit=r0,r0,1
	nop.i		0		}
{ .mmi;	st8		[tp_1]=n[0]
	add		tptr=16,sp
	add		tp_1=8,sp	};;

// Outer loop: one iteration per remaining bp[i]; repeats while num != 1
// at the bottom test (num is decremented on every pass).
.Louter:
{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
	ldf8		ahi[3]=[tptr]		// tp[0]
	add		r30=8,aptr	};;
{ .mmi;	ldf8		alo[4]=[aptr],16	// ap[0]
	ldf8		alo[3]=[r30],16		// ap[1]
	add		r31=8,nptr	};;
{ .mfb;	ldf8		alo[2]=[aptr],16	// ap[2]
	xma.hu		ahi[2]=alo[4],bi,ahi[3]	// ap[0]*bp[i]+tp[0]
	brp.loop.imp	.Linner_ctop,.Linner_cend-16
					}
{ .mfb;	ldf8		alo[1]=[r30]		// ap[3]
	xma.lu		alo[4]=alo[4],bi,ahi[3]
	clrrrb.pr			};;
{ .mfi;	ldf8		nlo[2]=[nptr],16	// np[0]
	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[i]
	nop.i		0		}
{ .mfi;	ldf8		nlo[1]=[r31]		// np[1]
	xma.lu		alo[3]=alo[3],bi,ahi[2]
	mov		pr.rot=0x20101f<<16
			// ------^----- (p40) at first (p23)
			// --------^--- (p30) at first (p22)
			// ----------^^ p[16:20]=1
					};;
{ .mfi;	st8		[tptr]=r0		// tp[0] is already accounted
	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[i]+tp[0])*n0
	mov		ar.lc=lc	}
{ .mfi;
	fcvt.fxu.s1	nhi[1]=f0
	mov		ar.ec=8		};;

// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
// in latter case accounts for two-tick pipeline stall, which means
// that its performance would be ~20% lower than optimal one. No
// attempt was made to address this, because original Itanium is
// hardly represented out in the wild...
.align	32
.Linner_ctop:
.pred.rel	"mutex",p40,p42
.pred.rel	"mutex",p30,p32
{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
{ .mfi;	(p16)	nop.m		0
	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
	(p16)	nop.f		0
	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
{ .mfi;	(p21)	ld8		t[0]=[tptr],8
	(p16)	nop.f		0
	(p42)	cmp.leu		p41,p39=n[2],a[2]	};; // (p23)
{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)
	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
	(p30)	add		a[1]=a[1],t[1]		}   // (p22)
{ .mfi;	(p16)	nop.m		0
	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
	(p32)	add		a[1]=a[1],t[1],1	};; // (p22)
{ .mmi;	(p21)	getf.sig	n[0]=nlo[3]
	(p16)	nop.m		0
	(p30)	cmp.ltu		p31,p29=a[1],t[1]	}   // (p22)
{ .mmb;	(p23)	st8		[tp_1]=n[2],8
	(p32)	cmp.leu		p31,p29=a[1],t[1]	    // (p22)
	br.ctop.sptk	.Linner_ctop			};;
.Linner_cend:

{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
	getf.sig	n[0]=nhi[4]
	nop.i		0		};;

{ .mmi;	.pred.rel	"mutex",p31,p33
(p31)	add		a[0]=a[0],topbit
(p33)	add		a[0]=a[0],topbit,1
	mov		topbit=r0	};;
{ .mfi; .pred.rel	"mutex",p31,p33
(p31)	cmp.ltu		p32,p30=a[0],topbit
(p33)	cmp.leu		p32,p30=a[0],topbit
					}
{ .mfi;	.pred.rel	"mutex",p40,p42
(p40)	add		n[0]=n[0],a[0]
(p42)	add		n[0]=n[0],a[0],1
					};;
{ .mmi;	.pred.rel	"mutex",p44,p46
(p40)	cmp.ltu		p41,p39=n[0],a[0]
(p42)	cmp.leu		p41,p39=n[0],a[0]
(p32)	add		topbit=r0,r0,1	}

{ .mmi;	st8		[tp_1]=n[0],8
	cmp4.ne		p6,p0=1,num
	sub		aptr=aptr,len	};;	// rewind
{ .mmi;	sub		nptr=nptr,len
(p41)	add		topbit=r0,r0,1
	add		tptr=16,sp	}
{ .mmb;	add		tp_1=8,sp
	add		num=-1,num		// num--
(p6)	br.cond.sptk.many	.Louter	};;

{ .mbb;	add		lc=4,lc
	brp.loop.imp	.Lsub_ctop,.Lsub_cend-16
	clrrrb.pr			};;
{ .mii;	nop.m		0
	mov		pr.rot=0x10001<<16
			// ------^---- (p33) at first (p17)
	mov		ar.lc=lc	}
{ .mii;	nop.m		0
	mov		ar.ec=3
	nop.i		0		};;

// Subtraction loop: rp[] = tp[] - np[], with the borrow carried between
// iterations in the rotating predicates.
.Lsub_ctop:
.pred.rel	"mutex",p33,p35
{ .mfi;	(p16)	ld8		t[0]=[tptr],8		    // t=*(tp++)
	(p16)	nop.f		0
	(p33)	sub		n[1]=t[1],n[1]		}   // (p17)
{ .mfi;	(p16)	ld8		n[0]=[nptr],8		    // n=*(np++)
	(p16)	nop.f		0
	(p35)	sub		n[1]=t[1],n[1],1	};; // (p17)
{ .mib;	(p18)	st8		[rptr]=n[2],8		    // *(rp++)=r
	(p33)	cmp.gtu		p34,p32=n[1],t[1]	    // (p17)
	(p18)	nop.b		0			}
{ .mib;	(p18)	nop.m		0
	(p35)	cmp.geu		p34,p32=n[1],t[1]	    // (p17)
	br.ctop.sptk	.Lsub_ctop			};;
.Lsub_cend:

// topbit minus the final borrow is used as a mask: the source pointer for
// the copy loop becomes (tptr AND topbit) OR (rptr AND NOT topbit).
{ .mmb;	.pred.rel	"mutex",p34,p36
(p34)	sub	topbit=topbit,r0	// (p19)
(p36)	sub	topbit=topbit,r0,1
	brp.loop.imp	.Lcopy_ctop,.Lcopy_cend-16
					}
{ .mmb;	sub	rptr=rptr,len		// rewind
	sub	tptr=tptr,len
	clrrrb.pr			};;
{ .mmi;	and	aptr=tptr,topbit
	andcm	bptr=rptr,topbit
	mov	pr.rot=1<<16		};;
{ .mii;	or	nptr=aptr,bptr
	mov	ar.lc=lc
	mov	ar.ec=3			};;

// Copy loop: writes the selected vector to rp[] and stores zeros over tp[].
.Lcopy_ctop:
{ .mmb;	(p16)	ld8	n[0]=[nptr],8
	(p18)	st8	[tptr]=r0,8
	(p16)	nop.b	0		}
{ .mmb;	(p16)	nop.m	0
	(p18)	st8	[rptr]=n[2],8
	br.ctop.sptk	.Lcopy_ctop	};;
.Lcopy_cend:

{ .mmi;	mov		ret0=1			// signal "handled"
	rum		1<<5			// clear um.mfh
	mov		ar.lc=prevlc	}
{ .mib;	.restore	sp
	mov		sp=prevsp
	mov		pr=prevpr,0x1ffff
	br.ret.sptk.many	b0	};;
.endp	bn_mul_mont_general#

// Symbolic register names for the short-vector procedure.
a1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
n1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
t0=r15;

ai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;

// Short-vector path, reached from bn_mul_mont when 2 <= r37 <= 8.
// The prologue saves ar.pfs, sp, ar.lc and pr and spills f16-f23 to the
// stack before those registers are loaded as ni0-ni7.
.align	64
.skip	48		// aligns loop body
.local	bn_mul_mont_8#
.proc	bn_mul_mont_8#
bn_mul_mont_8:
	.prologue
{ .mmi;	.save		ar.pfs,prevfs
	alloc		prevfs=ar.pfs,6,2,0,8
	.vframe		prevsp
	mov		prevsp=sp
	.save		ar.lc,prevlc
	mov		prevlc=ar.lc	}
{ .mmi;	add		r17=-6*16,sp
	add		sp=-7*16,sp
	.save		pr,prevpr
	mov		prevpr=pr	};;

{ .mmi;	.save.gf	0,0x10
	stf.spill	[sp]=f16,-16
	.save.gf	0,0x20
	stf.spill	[r17]=f17,32
	add		r16=-5*16,prevsp};;
{ .mmi;	.save.gf	0,0x40
	stf.spill	[r16]=f18,32
	.save.gf	0,0x80
	stf.spill	[r17]=f19,32
	$ADDP		aptr=0,in1	};;
{ .mmi;	.save.gf	0,0x100
	stf.spill	[r16]=f20,32
	.save.gf	0,0x200
	stf.spill	[r17]=f21,32
	$ADDP		r29=8,in1	};;
{ .mmi;	.save.gf	0,0x400
	stf.spill	[r16]=f22
	.save.gf	0,0x800
	stf.spill	[r17]=f23
	$ADDP		rptr=0,in0	};;

	.body
	.rotf		bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
	.rotr		t[8]

// load input vectors padding them to 8 elements
{ .mmi;	ldf8		ai0=[aptr],16		// ap[0]
	ldf8		ai1=[r29],16		// ap[1]
	$ADDP		bptr=0,in2	}
{ .mmi;	$ADDP		r30=8,in2
	$ADDP		nptr=0,in3
	$ADDP		r31=8,in3	};;
{ .mmi;	ldf8		bj[7]=[bptr],16		// bp[0]
	ldf8		bj[6]=[r30],16		// bp[1]
	cmp4.le		p4,p5=3,in5	}
{ .mmi;	ldf8		ni0=[nptr],16		// np[0]
	ldf8		ni1=[r31],16		// np[1]
421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp4.le p6,p7=4,in5 };; 422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2] 424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p5)fcvt.fxu ai2=f0 425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp4.le p8,p9=5,in5 } 426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3] 427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p7)fcvt.fxu ai3=f0 428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp4.le p10,p11=6,in5 } 429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2] 430392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p5)fcvt.fxu bj[5]=f0 431392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp4.le p12,p13=7,in5 } 432392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3] 433392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p7)fcvt.fxu bj[4]=f0 434392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp4.le p14,p15=8,in5 } 435392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2] 436392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p5)fcvt.fxu ni2=f0 437392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addp4 r28=-1,in5 } 438392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3] 439392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p7)fcvt.fxu ni3=f0 440392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom $ADDP in4=0,in4 };; 441392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 442392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; ldf8 n0=[in4] 443392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcvt.fxu tf[1]=f0 444392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom nop.i 0 } 
445392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 446392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4] 447392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p9)fcvt.fxu ai4=f0 448392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov t[0]=r0 } 449392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5] 450392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p11)fcvt.fxu ai5=f0 451392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov t[1]=r0 } 452392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4] 453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p9)fcvt.fxu bj[3]=f0 454392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov t[2]=r0 } 455392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5] 456392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p11)fcvt.fxu bj[2]=f0 457392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov t[3]=r0 } 458392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4] 459392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p9)fcvt.fxu ni4=f0 460392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov t[4]=r0 } 461392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5] 462392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p11)fcvt.fxu ni5=f0 463392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov t[5]=r0 };; 464392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 465392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6] 466392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p13)fcvt.fxu ai6=f0 467392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov t[6]=r0 } 468392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7] 
469392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p15)fcvt.fxu ai7=f0 470392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov t[7]=r0 } 471392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6] 472392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p13)fcvt.fxu bj[1]=f0 473392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ar.lc=r28 } 474392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7] 475392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p15)fcvt.fxu bj[0]=f0 476392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ar.ec=1 } 477392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6] 478392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p13)fcvt.fxu ni6=f0 479392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov pr.rot=1<<16 } 480392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7] 481392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p15)fcvt.fxu ni7=f0 482392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16 483392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom };; 484392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 485392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt 486392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// to measure with help of Interval Time Counter indicated that the 487392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// factor is a tad higher: 33 or 34, if not 35. Exact measurement and 488392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// addressing the issue is problematic, because I don't have access 489392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// to platform-specific instruction-level profiler. 
On Itanium it 490392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// should run in 56*n ticks, because of higher xma latency... 491392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Louter_8_ctop: 492392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p40,p42 493392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p48,p50 494392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 0: 495392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0] 496392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) add a3=a3,n3 } // (p17) a3+=n3 497392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p42) add a3=a3,n3,1 498392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu alo[0]=ai0,bj[7],tf[1] 499392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 500392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p17) getf.sig a7=alo[8] // 1: 501392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 502392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) add t[6]=t[6],a3,1 };; 503392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p17) getf.sig a8=ahi[8] // 2: 504392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 505392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) cmp.ltu p43,p41=a3,n3 } 506392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p42) cmp.leu p43,p41=a3,n3 507392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] 508392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 509392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p17) getf.sig n5=nlo[6] // 3: 510392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) cmp.ltu p51,p49=t[6],a3 
511392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) cmp.leu p51,p49=t[6],a3 };; 512392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p41,p43 513392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p49,p51 514392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 4: 515392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i] 516392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p41) add a4=a4,n4 } // (p17) a4+=n4 517392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p43) add a4=a4,n4,1 518392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu alo[1]=ai1,bj[7],ahi[0] 519392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 520392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 521392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0 522392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p51) add t[5]=t[5],a4,1 };; 523392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 6: 524392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 525392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p41) cmp.ltu p42,p40=a4,n4 } 526392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p43) cmp.leu p42,p40=a4,n4 527392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] 528392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 529392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p17) getf.sig n6=nlo[7] // 7: 530392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p49) cmp.ltu p50,p48=t[5],a4 531392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p51) cmp.leu p50,p48=t[5],a4 };; 
532392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p40,p42 533392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p48,p50 534392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 8: 535392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i] 536392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) add a5=a5,n5 } // (p17) a5+=n5 537392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p42) add a5=a5,n5,1 538392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu alo[2]=ai2,bj[7],ahi[1] 539392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 540392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p16) getf.sig a1=alo[1] // 9: 541392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 542392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) add t[4]=t[4],a5,1 };; 543392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 10: 544392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0 545392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) cmp.ltu p43,p41=a5,n5 } 546392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p42) cmp.leu p43,p41=a5,n5 547392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu nlo[0]=ni0,mj[0],alo[0] 548392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 549392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p17) getf.sig n7=nlo[8] // 11: 550392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) cmp.ltu p51,p49=t[4],a5 551392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) cmp.leu p51,p49=t[4],a5 };; 552392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p41,p43 553392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel 
"mutex",p49,p51 554392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p17) getf.sig n8=nhi[8] // 12: 555392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i] 556392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p41) add a6=a6,n6 } // (p17) a6+=n6 557392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p43) add a6=a6,n6,1 558392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu alo[3]=ai3,bj[7],ahi[2] 559392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 560392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p16) getf.sig a2=alo[2] // 13: 561392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 562392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p51) add t[3]=t[3],a6,1 };; 563392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 14: 564392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0 565392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p41) cmp.ltu p42,p40=a6,n6 } 566392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p43) cmp.leu p42,p40=a6,n6 567392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0] 568392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 569392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p16) nop.m 0 // 15: 570392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p49) cmp.ltu p50,p48=t[3],a6 571392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p51) cmp.leu p50,p48=t[3],a6 };; 572392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p40,p42 573392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p48,p50 574392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 16: 575392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i] 576392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) add a7=a7,n7 } // (p17) a7+=n7 577392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p42) add a7=a7,n7,1 578392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu alo[4]=ai4,bj[7],ahi[3] 579392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 580392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p16) getf.sig a3=alo[3] // 17: 581392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 582392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) add t[2]=t[2],a7,1 };; 583392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 18: 584392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0 585392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) cmp.ltu p43,p41=a7,n7 } 586392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p42) cmp.leu p43,p41=a7,n7 587392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1] 588392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 589392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p16) getf.sig n1=nlo[1] // 19: 590392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) cmp.ltu p51,p49=t[2],a7 591392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) cmp.leu p51,p49=t[2],a7 };; 592392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p41,p43 593392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p49,p51 594392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 20: 595392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i] 596392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p41) add a8=a8,n8 } // (p17) a8+=n8 
597392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p43) add a8=a8,n8,1 598392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu alo[5]=ai5,bj[7],ahi[4] 599392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 600392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p16) getf.sig a4=alo[4] // 21: 601392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 602392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p51) add t[1]=t[1],a8,1 };; 603392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 22: 604392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0 605392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p41) cmp.ltu p42,p40=a8,n8 } 606392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p43) cmp.leu p42,p40=a8,n8 607392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2] 608392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 609392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p16) getf.sig n2=nlo[2] // 23: 610392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p49) cmp.ltu p50,p48=t[1],a8 611392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p51) cmp.leu p50,p48=t[1],a8 };; 612392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 24: 613392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i] 614392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) add a1=a1,n1 } // (p16) a1+=n1 615392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 616392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu alo[6]=ai6,bj[7],ahi[5] 617392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p17) mov t[0]=r0 };; 618392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p16) 
getf.sig a5=alo[5] // 25: 619392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) add t0=t[7],a1 // (p16) t[7]+=a1 620392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p42) add t[0]=t[0],r0,1 };; 621392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) setf.sig tf[0]=t0 // 26: 622392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0 623392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) add t[0]=t[0],r0,1 } 624392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1 625392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3] 626392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 627392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p16) getf.sig n3=nlo[3] // 27: 628392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) cmp.ltu.unc p50,p48=t0,a1 629392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 630392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p40,p42 631392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p48,p50 632392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 28: 633392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i] 634392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) add a2=a2,n2 } // (p16) a2+=n2 635392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p42) add a2=a2,n2,1 636392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu alo[7]=ai7,bj[7],ahi[6] 637392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 638392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p16) getf.sig a6=alo[6] // 29: 639392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2 
640392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) add t[6]=t[6],a2,1 };; 641392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) nop.m 0 // 30: 642392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0 643392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) cmp.ltu p41,p39=a2,n2 } 644392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p42) cmp.leu p41,p39=a2,n2 645392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4] 646392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.i 0 };; 647392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p16) getf.sig n4=nlo[4] // 31: 648392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.f 0 649392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) cmp.ltu p49,p47=t[6],a2 } 650392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfb; (p50) cmp.leu p49,p47=t[6],a2 651392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p16) nop.f 0 652392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom br.ctop.sptk.many .Louter_8_ctop };; 653392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Louter_8_cend: 654392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 655392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// above loop has to execute one more time, without (p16), which is 656392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// replaced with merged move of np[8] to GPR bank 657392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p40,p42 658392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p48,p50 659392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; (p0) getf.sig n1=ni0 // 0: 660392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) add a3=a3,n3 // (p17) a3+=n3 661392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p42) add a3=a3,n3,1 };; 
662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p17) getf.sig a7=alo[8] // 1: 663392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 664392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) add t[6]=t[6],a3,1 };; 665392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p17) getf.sig a8=ahi[8] // 2: 666392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 667392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) cmp.ltu p43,p41=a3,n3 } 668392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p42) cmp.leu p43,p41=a3,n3 669392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] 670392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p0) nop.i 0 };; 671392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p17) getf.sig n5=nlo[6] // 3: 672392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) cmp.ltu p51,p49=t[6],a3 673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) cmp.leu p51,p49=t[6],a3 };; 674392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p41,p43 675392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p49,p51 676392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; (p0) getf.sig n2=ni1 // 4: 677392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p41) add a4=a4,n4 // (p17) a4+=n4 678392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p43) add a4=a4,n4,1 };; 679392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 680392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p0) nop.f 0 681392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p51) add t[5]=t[5],a4,1 };; 682392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p0) getf.sig n3=ni2 // 6: 683392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p17) xma.hu 
nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 684392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p41) cmp.ltu p42,p40=a4,n4 } 685392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; (p43) cmp.leu p42,p40=a4,n4 686392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] 687392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p0) nop.i 0 };; 688392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p17) getf.sig n6=nlo[7] // 7: 689392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p49) cmp.ltu p50,p48=t[5],a4 690392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p51) cmp.leu p50,p48=t[5],a4 };; 691392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p40,p42 692392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p48,p50 693392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p0) getf.sig n4=ni3 // 8: 694392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) add a5=a5,n5 // (p17) a5+=n5 695392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p42) add a5=a5,n5,1 };; 696392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p0) nop.m 0 // 9: 697392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 698392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) add t[4]=t[4],a5,1 };; 699392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p0) nop.m 0 // 10: 700392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) cmp.ltu p43,p41=a5,n5 701392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p42) cmp.leu p43,p41=a5,n5 };; 702392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p17) getf.sig n7=nlo[8] // 11: 703392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) cmp.ltu p51,p49=t[4],a5 704392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) cmp.leu p51,p49=t[4],a5 };; 705392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel 
"mutex",p41,p43 706392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p49,p51 707392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p17) getf.sig n8=nhi[8] // 12: 708392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p41) add a6=a6,n6 // (p17) a6+=n6 709392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p43) add a6=a6,n6,1 };; 710392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p0) getf.sig n5=ni4 // 13: 711392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 712392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p51) add t[3]=t[3],a6,1 };; 713392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p0) nop.m 0 // 14: 714392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p41) cmp.ltu p42,p40=a6,n6 715392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p43) cmp.leu p42,p40=a6,n6 };; 716392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p0) getf.sig n6=ni5 // 15: 717392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p49) cmp.ltu p50,p48=t[3],a6 718392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p51) cmp.leu p50,p48=t[3],a6 };; 719392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p40,p42 720392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p48,p50 721392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p0) nop.m 0 // 16: 722392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) add a7=a7,n7 // (p17) a7+=n7 723392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p42) add a7=a7,n7,1 };; 724392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p0) nop.m 0 // 17: 725392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 726392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) add t[2]=t[2],a7,1 };; 727392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p0) nop.m 0 // 18: 
728392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p40) cmp.ltu p43,p41=a7,n7 729392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p42) cmp.leu p43,p41=a7,n7 };; 730392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p0) getf.sig n7=ni6 // 19: 731392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p48) cmp.ltu p51,p49=t[2],a7 732392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p50) cmp.leu p51,p49=t[2],a7 };; 733392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p41,p43 734392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p49,p51 735392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p0) nop.m 0 // 20: 736392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p41) add a8=a8,n8 // (p17) a8+=n8 737392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p43) add a8=a8,n8,1 };; 738392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; (p0) nop.m 0 // 21: 739392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 740392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p51) add t[1]=t[1],a8,1 } 741392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; (p17) mov t[0]=r0 742392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p41) cmp.ltu p42,p40=a8,n8 743392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p43) cmp.leu p42,p40=a8,n8 };; 744392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; (p0) getf.sig n8=ni7 // 22: 745392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p49) cmp.ltu p50,p48=t[1],a8 746392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p51) cmp.leu p50,p48=t[1],a8 } 747392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; (p42) add t[0]=t[0],r0,1 748392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p0) add r16=-7*16,prevsp 749392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p0) add r17=-6*16,prevsp };; 
750392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 751392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// subtract np[8] from carrybit|tmp[8] 752392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// carrybit|tmp[8] layout upon exit from above loop is: 753392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant) 754392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; (p50)add t[0]=t[0],r0,1 755392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add r18=-5*16,prevsp 756392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub n1=t0,n1 };; 757392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; cmp.gtu p34,p32=n1,t0;; 758392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p32,p34 759392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p32)sub n2=t[7],n2 760392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)sub n2=t[7],n2,1 };; 761392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p32)cmp.gtu p35,p33=n2,t[7] 762392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)cmp.geu p35,p33=n2,t[7];; 763392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p33,p35 764392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p33)sub n3=t[6],n3 } 765392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; (p35)sub n3=t[6],n3,1;; 766392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p33)cmp.gtu p34,p32=n3,t[6] 767392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p35)cmp.geu p34,p32=n3,t[6] };; 768392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p32,p34 769392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p32)sub n4=t[5],n4 770392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)sub n4=t[5],n4,1;; 771392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p32)cmp.gtu p35,p33=n4,t[5] } 772392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];; 773392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p33,p35 774392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p33)sub n5=t[4],n5 775392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p35)sub n5=t[4],n5,1 };; 776392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p33)cmp.gtu p34,p32=n5,t[4] 777392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p35)cmp.geu p34,p32=n5,t[4];; 778392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p32,p34 779392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p32)sub n6=t[3],n6 } 780392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; (p34)sub n6=t[3],n6,1;; 781392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p32)cmp.gtu p35,p33=n6,t[3] 782392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)cmp.geu p35,p33=n6,t[3] };; 783392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p33,p35 784392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p33)sub n7=t[2],n7 785392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p35)sub n7=t[2],n7,1;; 786392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p33)cmp.gtu p34,p32=n7,t[2] } 787392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];; 788392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p32,p34 789392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p32)sub n8=t[1],n8 790392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)sub n8=t[1],n8,1 };; 791392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; (p32)cmp.gtu p35,p33=n8,t[1] 792392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)cmp.geu p35,p33=n8,t[1];; 793392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p33,p35 794392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p33)sub a8=t[0],r0 } 
795392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; (p35)sub a8=t[0],r0,1;; 796392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p33)cmp.gtu p34,p32=a8,t[0] 797392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p35)cmp.geu p34,p32=a8,t[0] };; 798392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 799392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// save the result, either tmp[num] or tmp[num]-np[num] 800392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .pred.rel "mutex",p32,p34 801392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; (p32)st8 [rptr]=n1,8 802392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)st8 [rptr]=t0,8 803392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add r19=-4*16,prevsp};; 804392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb; (p32)st8 [rptr]=n2,8 805392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)st8 [rptr]=t[7],8 806392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p5)br.cond.dpnt.few .Ldone };; 807392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb; (p32)st8 [rptr]=n3,8 808392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)st8 [rptr]=t[6],8 809392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p7)br.cond.dpnt.few .Ldone };; 810392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb; (p32)st8 [rptr]=n4,8 811392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)st8 [rptr]=t[5],8 812392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p9)br.cond.dpnt.few .Ldone };; 813392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb; (p32)st8 [rptr]=n5,8 814392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)st8 [rptr]=t[4],8 815392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p11)br.cond.dpnt.few .Ldone };; 816392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb; (p32)st8 [rptr]=n6,8 817392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)st8 [rptr]=t[3],8 
818392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p13)br.cond.dpnt.few .Ldone };; 819392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb; (p32)st8 [rptr]=n7,8 820392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)st8 [rptr]=t[2],8 821392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p15)br.cond.dpnt.few .Ldone };; 822392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb; (p32)st8 [rptr]=n8,8 823392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom (p34)st8 [rptr]=t[1],8 824392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom nop.b 0 };; 825392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Ldone: // epilogue 826392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ldf.fill f16=[r16],64 827392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldf.fill f17=[r17],64 828392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom nop.i 0 } 829392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ldf.fill f18=[r18],64 830392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldf.fill f19=[r19],64 831392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov pr=prevpr,0x1ffff };; 832392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ldf.fill f20=[r16] 833392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldf.fill f21=[r17] 834392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ar.lc=prevlc } 835392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ldf.fill f22=[r18] 836392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldf.fill f23=[r19] 837392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov ret0=1 } // signal "handled" 838392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mib; rum 1<<5 839392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .restore sp 840392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov sp=prevsp 841392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom br.ret.sptk.many b0 };; 
.endp	bn_mul_mont_8#

.type	copyright#,\@object
copyright:
stringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Emit the generated assembly.
#
# The first command-line argument, when present and true, names the output
# file and STDOUT is redirected to it; otherwise the assembly is written to
# the caller's STDOUT. The two-argument form of open is kept on purpose so
# that output names are interpreted exactly as before.
#
# Both the redirect and the final close are checked: a failed open used to
# fall through and print to the original STDOUT, and an unchecked close can
# hide a short write (e.g. disk full), leaving a truncated assembly file
# while the script still exits with status 0.
$output=shift;
if ($output) {
    open STDOUT,">$output" or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";