1392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#!/usr/bin/env perl
2392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
3392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# ====================================================================
4392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# project. The module is, however, dual licensed under OpenSSL and
6392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CRYPTOGAMS licenses depending on where you obtain it. For further
7392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# details see http://www.openssl.org/~appro/cryptogams/.
8392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# ====================================================================
9392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
10392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# January 2010
11392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
12392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# "Teaser" Montgomery multiplication module for IA-64. There are
13392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# several possibilities for improvement:
14392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
15392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# - modulo-scheduling outer loop would eliminate quite a number of
16392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   stalls after ldf8, xma and getf.sig outside inner loop and
17392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   improve shorter key performance;
18392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# - shorter vector support [with input vectors being fetched only
19392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   once] should be added;
20392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# - 2x unroll with help of n0[1] would make the code scalable on
21392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
22392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   acute interest, because upcoming Tukwila's individual cores are
23392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   reportedly based on Itanium 2 design;
24392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# - dedicated squaring procedure(?);
25392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
26392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# January 2010
27392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
28392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Shorter vector support is implemented by zero-padding ap and np
29392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# vectors up to 8 elements, or 512 bits. This means that 256-bit
30392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# inputs will be processed only 2 times faster than 512-bit inputs,
31392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# not 4 [as one would expect, because algorithm complexity is n^2].
32392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# The reason for padding is that inputs shorter than 512 bits won't
33392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# be processed faster anyway, because minimal critical path of the
34392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# core loop happens to match 512-bit timing. Either way, it resulted
35392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
36392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# 1024-bit one [in comparison to original version of *this* module].
37392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
38392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
39392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# this module is:
40392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#                   sign    verify    sign/s verify/s
41392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
42392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
43392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
44392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
45392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
46392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
47392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
48392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
49392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# ... and *without* (but still with ia64.S):
50392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
51392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
52392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
53392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
54392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
55392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
56392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
57392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
58392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
59392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# As can be seen, RSA sign performance improves by 130-30% (the gain
60392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# shrinking for longer keys), while RSA verify improves by 74-13%.
61392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# DSA performance improves by 115-30%.
62392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Pick the pointer-add mnemonic that is interpolated into the assembly
# text as $ADDP.  On HP-UX the default is "addp4"; it is replaced by
# plain "add" when a 64-bit data-model flag (+DD64 or -mlp64) appears
# among the command-line arguments.  Every other platform uses "add".
# (Presumably addp4 is needed because HP-UX defaults to 32-bit
# pointers -- confirm against the build configuration.)
63392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromif ($^O eq "hpux") {
64392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $ADDP="addp4";
    # Alternation, not a character class: the former [\+DD|\-mlp]64
    # matched any single one of + D | - m l p followed by "64" (for
    # instance a path containing "p64") and so could select "add" in
    # a 32-bit build.
65392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
66392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom} else { $ADDP="add"; }
67392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
68392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code=<<___;
69392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.explicit
70392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.text
71392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
72392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
73392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom//		    const BN_ULONG *bp,const BN_ULONG *np,
74392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom//		    const BN_ULONG *n0p,int num);
//
// Entry point. It only dispatches on num, read from r37 (the sixth
// argument register, called in5 in bn_mul_mont_general):
//   num < 2        return 0 in ret0 without doing any work
//                  (presumably "not handled": bn_mul_mont_general
//                  returns 1 to signal "handled");
//   2 <= num <= 8  branch to bn_mul_mont_8;
//   num > 8        branch to bn_mul_mont_general.
// The branch targets take over the caller's frame and return to the
// caller directly through b0.
75392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	64
76392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.global	bn_mul_mont#
77392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.proc	bn_mul_mont#
78392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombn_mul_mont:
79392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.prologue
80392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.body
81392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	cmp4.le		p6,p7=2,r37;;	// p6 = (2 <= num), p7 = (num < 2)
82392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p6)	cmp4.lt.unc	p8,p9=8,r37	// if p6: p8 = (8 < num), p9 = (num <= 8); otherwise both cleared
83392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		ret0=r0		};;	// default return value 0
84392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .bbb;
85392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p9)	br.cond.dptk.many	bn_mul_mont_8
86392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p8)	br.cond.dpnt.many	bn_mul_mont_general
87392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p7)	br.ret.spnt.many	b0	};;
88392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.endp	bn_mul_mont#
89392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// Symbolic register names used by bn_mul_mont_general.
// Saved copies of ar.pfs, pr, ar.lc and sp, taken in its prologue:
90392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromprevfs=r2;	prevpr=r3;	prevlc=r10;	prevsp=r11;
91392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// Working pointers derived from the rp, ap, bp and np arguments:
92392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromrptr=r8;	aptr=r9;	bptr=r14;	nptr=r15;
93392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromtptr=r16;	// &tp[0]
94392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromtp_1=r17;	// &tp[-1]
// num: zero-extended word count (later counts the passes left);
// len: num*8, the vector length in bytes; lc: value loaded into ar.lc.
95392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromnum=r18;	len=r19;	lc=r20;
96392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromtopbit=r21;	// carry bit from tmp[num]
97392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
98392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromn0=f6;	// value loaded from *n0p
99392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromm0=f7;	// per-pass multiplier: low 64 bits of (ap[0]*bp[i]+tp[0])*n0
100392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombi=f8;	// current word of bp
101392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// bn_mul_mont_general: worker reached from bn_mul_mont when num > 8.
// in0..in5 are rp, ap, bp, np, n0p and num, as in the prototype of
// bn_mul_mont. Outline, as far as the code below shows:
//   1. the prologue saves ar.pfs, ar.lc, sp and pr, then carves a
//      16-byte aligned area of at least num*8+16 bytes off the stack;
//      tp[0] lives at sp+16 and tp[-1] at sp+8;
//   2. .L1st_ctop: first pass over ap[] and np[] using bp[0], with the
//      results stored through tp_1;
//   3. .Louter/.Linner_ctop: one pass per remaining word of bp, each
//      adding in the tp[] contents left by the previous pass;
//   4. .Lsub_ctop: rp[] = tp[] - np[] with borrow propagation;
//   5. .Lcopy_ctop: copies either tp[] or rp[] (chosen by the topbit
//      mask) into rp[] and overwrites tp[] with zeros;
//   6. returns 1 in ret0.
// All loops are br.ctop loops driven by ar.lc/ar.ec and by rotating
// predicates that are preset with "mov pr.rot=...".
102392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	64
103392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.local	bn_mul_mont_general#
104392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.proc	bn_mul_mont_general#
105392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombn_mul_mont_general:
106392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.prologue
107392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	.save	ar.pfs,prevfs
108392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	alloc	prevfs=ar.pfs,6,2,0,8
109392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$ADDP	aptr=0,in1
110392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.save	ar.lc,prevlc
111392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	prevlc=ar.lc		}
112392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	.vframe	prevsp
113392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	prevsp=sp
114392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$ADDP	bptr=0,in2
115392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.save	pr,prevpr
116392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	prevpr=pr		};;
117392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
118392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.body
119392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rotf		alo[6],nlo[4],ahi[8],nhi[6]
120392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rotr		a[3],n[3],t[2]
121392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// Preload bp[0], ap[0..3], n0 and np[0..1]; derive len and lc from
// num; allocate the tp[] area; start the first two products.
122392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
123392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ldf8		alo[4]=[aptr],16	// ap[0]
124392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$ADDP		r30=8,in1	};;
125392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf8		alo[3]=[r30],16		// ap[1]
126392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ldf8		alo[2]=[aptr],16	// ap[2]
127392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$ADDP		in4=0,in4	};;
128392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf8		alo[1]=[r30]		// ap[3]
129392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ldf8		n0=[in4]		// n0
130392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$ADDP		rptr=0,in0		}
131392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	$ADDP		nptr=0,in3
132392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		r31=16
133392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	zxt4		num=in5		};;
134392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf8		nlo[2]=[nptr],8		// np[0]
135392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shladd		len=num,3,r0
136392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shladd		r31=num,3,r31	};;
137392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf8		nlo[1]=[nptr],8		// np[1]
138392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		lc=-5,num
139392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub		r31=sp,r31	};;
140392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfb;	and		sp=-16,r31		// alloca
141392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xmpy.hu		ahi[2]=alo[4],bi	// ap[0]*bp[0]
142392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	nop.b		0		}
143392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfb;	nop.m		0
144392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xmpy.lu		alo[4]=alo[4],bi
145392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	brp.loop.imp	.L1st_ctop,.L1st_cend-16
146392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom					};;
147392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	nop.m		0
148392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[0]
149392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		tp_1=8,sp	}
150392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	nop.m		0
151392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xma.lu		alo[3]=alo[3],bi,ahi[2]
152392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		pr.rot=0x20001f<<16
153392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom			// ------^----- (p40) at first (p23)
154392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom			// ----------^^ p[16:20]=1
155392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom					};;
156392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	nop.m		0
157392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[0])*n0
158392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		ar.lc=lc	}
159392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	nop.m		0
160392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	fcvt.fxu.s1	nhi[1]=f0
161392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		ar.ec=8		};;
162392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// First pass (bp[0]): each iteration loads further ap[]/np[] words,
// forms the ap[j]*bi and np[j]*m0 products, adds the two low halves
// with carry (p40/p42 select carry-in, p41/p39 record carry-out) and
// stores the sum through tp_1.
163392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	32
164392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L1st_ctop:
165392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.pred.rel	"mutex",p40,p42
166392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
167392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
168392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
169392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)(p16)
170392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
171392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
172392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
173392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
174392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p42)	cmp.leu		p41,p39=n[2],a[2]   	}   // (p23)
175392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p23)	st8		[tp_1]=n[2],8
176392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
177392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
178392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	(p21)	getf.sig	n[0]=nlo[3]
179392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.m		0
180392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	br.ctop.sptk	.L1st_ctop			};;
181392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L1st_cend:
182392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// Tail of the first pass: add the two final high words with carry,
// store the sum, set topbit to the carry out (0 or 1), decrement num,
// rewind aptr/nptr by len and reset tptr/tp_1 to sp+16/sp+8.
183392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
184392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	getf.sig	n[0]=nhi[4]
185392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		num=-1,num	};;	// num--
186392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	.pred.rel	"mutex",p40,p42
187392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p40)	add		n[0]=n[0],a[0]
188392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p42)	add		n[0]=n[0],a[0],1
189392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub		aptr=aptr,len	};;	// rewind
190392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	.pred.rel	"mutex",p40,p42
191392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p40)	cmp.ltu		p41,p39=n[0],a[0]
192392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p42)	cmp.leu		p41,p39=n[0],a[0]
193392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub		nptr=nptr,len	};;
194392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	.pred.rel	"mutex",p39,p41
195392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p39)	add		topbit=r0,r0
196392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p41)	add		topbit=r0,r0,1
197392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	nop.i		0		}
198392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	st8		[tp_1]=n[0]
199392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		tptr=16,sp
200392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		tp_1=8,sp	};;
201392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// Outer loop: one pass per remaining word of bp. It repeats while num
// (tested before its decrement at the bottom) is not 1. Each pass
// reloads ap[0..3]/np[0..1], recomputes m0 from ap[0]*bp[i]+tp[0] and
// resets the rotating registers and predicates before the inner loop.
202392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Louter:
203392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
204392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ldf8		ahi[3]=[tptr]		// tp[0]
205392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		r30=8,aptr	};;
206392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf8		alo[4]=[aptr],16	// ap[0]
207392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ldf8		alo[3]=[r30],16		// ap[1]
208392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		r31=8,nptr	};;
209392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfb;	ldf8		alo[2]=[aptr],16	// ap[2]
210392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xma.hu		ahi[2]=alo[4],bi,ahi[3]	// ap[0]*bp[i]+tp[0]
211392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	brp.loop.imp	.Linner_ctop,.Linner_cend-16
212392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom					}
213392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfb;	ldf8		alo[1]=[r30]		// ap[3]
214392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xma.lu		alo[4]=alo[4],bi,ahi[3]
215392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	clrrrb.pr			};;
216392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	ldf8		nlo[2]=[nptr],16	// np[0]
217392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[i]
218392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	nop.i		0		}
219392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	ldf8		nlo[1]=[r31]		// np[1]
220392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xma.lu		alo[3]=alo[3],bi,ahi[2]
221392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		pr.rot=0x20101f<<16
222392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom			// ------^----- (p40) at first (p23)
223392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom			// --------^--- (p30) at first (p22)
224392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom			// ----------^^ p[16:20]=1
225392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom					};;
226392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	st8		[tptr]=r0		// tp[0] is already accounted
227392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[i]+tp[0])*n0
228392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		ar.lc=lc	}
229392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;
230392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	fcvt.fxu.s1	nhi[1]=f0
231392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		ar.ec=8		};;
232392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
233392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
234392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
235392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// in latter case accounts for two-tick pipeline stall, which means
236392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// that its performance would be ~20% lower than optimal one. No
237392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// attempt was made to address this, because original Itanium is
238392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// hardly represented out in the wild...
//
// Compared with the first-pass loop, this one also loads the previous
// tp[] word into t[] and adds it to a[] (carry tracked in p30/p32 and
// p31/p29) before a[] is added to n[] and stored through tp_1.
239392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	32
240392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Linner_ctop:
241392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.pred.rel	"mutex",p40,p42
242392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.pred.rel	"mutex",p30,p32
243392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
244392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
245392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
246392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0
247392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
248392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
249392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
250392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.f		0
251392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
252392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p21)	ld8		t[0]=[tptr],8
253392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.f		0
254392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p42)	cmp.leu		p41,p39=n[2],a[2]	};; // (p23)
255392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)
256392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
257392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p30)	add		a[1]=a[1],t[1]		}   // (p22)
258392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0
259392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
260392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p32)	add		a[1]=a[1],t[1],1	};; // (p22)
261392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p21)	getf.sig	n[0]=nlo[3]
262392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.m		0
263392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p30)	cmp.ltu		p31,p29=a[1],t[1]	}   // (p22)
264392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	(p23)	st8		[tp_1]=n[2],8
265392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p32)	cmp.leu		p31,p29=a[1],t[1]	    // (p22)
266392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	br.ctop.sptk	.Linner_ctop			};;
267392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Linner_cend:
268392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// Tail of a pass: add the old topbit (plus pending carry) to the final
// high word of the a-chain, add that to the final high word of the
// n-chain, store the sum, and rebuild topbit from the two carry-outs
// (p32 and p41).
269392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
270392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	getf.sig	n[0]=nhi[4]
271392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	nop.i		0		};;
272392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
273392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	.pred.rel	"mutex",p31,p33
274392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p31)	add		a[0]=a[0],topbit
275392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p33)	add		a[0]=a[0],topbit,1
276392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		topbit=r0	};;
277392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; .pred.rel	"mutex",p31,p33
278392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p31)	cmp.ltu		p32,p30=a[0],topbit
279392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p33)	cmp.leu		p32,p30=a[0],topbit
280392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom					}
281392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	.pred.rel	"mutex",p40,p42
282392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p40)	add		n[0]=n[0],a[0]
283392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p42)	add		n[0]=n[0],a[0],1
284392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom					};;
// NOTE(review): the annotation in the next bundle names p44,p46, but
// the instructions it precedes are predicated on p40 and p42 (as in
// the bundle above) -- confirm whether "mutex",p40,p42 was intended.
285392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	.pred.rel	"mutex",p44,p46
286392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p40)	cmp.ltu		p41,p39=n[0],a[0]
287392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p42)	cmp.leu		p41,p39=n[0],a[0]
288392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p32)	add		topbit=r0,r0,1	}
289392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
290392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	st8		[tp_1]=n[0],8
291392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp4.ne		p6,p0=1,num
292392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub		aptr=aptr,len	};;	// rewind
293392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	sub		nptr=nptr,len
294392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p41)	add		topbit=r0,r0,1
295392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		tptr=16,sp	}
296392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	add		tp_1=8,sp
297392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		num=-1,num		// num--
298392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p6)	br.cond.sptk.many	.Louter	};;
299392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// Subtraction: lc is raised from (original num)-5 to (original num)-1
// and reused as ar.lc for this loop and for the copy loop. The loop
// stores tp[j]-np[j] (with borrow) to rp[j]; p33/p35 select borrow-in
// and p34/p32 record borrow-out.
300392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mbb;	add		lc=4,lc
301392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	brp.loop.imp	.Lsub_ctop,.Lsub_cend-16
302392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	clrrrb.pr			};;
303392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	nop.m		0
304392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		pr.rot=0x10001<<16
305392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom			// ------^---- (p33) at first (p17)
306392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		ar.lc=lc	}
307392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	nop.m		0
308392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		ar.ec=3
309392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	nop.i		0		};;
310392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
311392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsub_ctop:
312392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.pred.rel	"mutex",p33,p35
313392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	ld8		t[0]=[tptr],8		    // t=*(tp++)
314392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.f		0
315392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p33)	sub		n[1]=t[1],n[1]		}   // (p17)
316392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	ld8		n[0]=[nptr],8		    // n=*(np++)
317392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.f		0
318392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p35)	sub		n[1]=t[1],n[1],1	};; // (p17)
319392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mib;	(p18)	st8		[rptr]=n[2],8		    // *(rp++)=r
320392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p33)	cmp.gtu		p34,p32=n[1],t[1]	    // (p17)
321392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p18)	nop.b		0			}
322392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mib;	(p18)	nop.m		0
323392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p35)	cmp.geu		p34,p32=n[1],t[1]	    // (p17)
324392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	br.ctop.sptk	.Lsub_ctop			};;
325392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsub_cend:
326392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// Result selection without a branch: topbit has either 0 or 1
// subtracted from it, then serves as a mask, giving
// nptr = (tptr & topbit) | (rptr & ~topbit). This picks tptr when
// topbit is all ones and rptr when it is zero (assumes those are the
// only values reached here -- TODO confirm).
327392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	.pred.rel	"mutex",p34,p36
328392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p34)	sub	topbit=topbit,r0	// (p19)
329392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p36)	sub	topbit=topbit,r0,1
330392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	brp.loop.imp	.Lcopy_ctop,.Lcopy_cend-16
331392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom					}
332392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	sub	rptr=rptr,len		// rewind
333392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	tptr=tptr,len
334392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	clrrrb.pr			};;
335392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	and	aptr=tptr,topbit
336392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	andcm	bptr=rptr,topbit
337392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	pr.rot=1<<16		};;
338392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	or	nptr=aptr,bptr
339392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	ar.lc=lc
340392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	ar.ec=3			};;
341392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// Copy loop: read the selected vector through nptr, write it to rp[],
// and overwrite each tp[] word with zero.
342392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcopy_ctop:
343392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	(p16)	ld8	n[0]=[nptr],8
344392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p18)	st8	[tptr]=r0,8
345392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.b	0		}
346392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	(p16)	nop.m	0
347392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p18)	st8	[rptr]=n[2],8
348392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	br.ctop.sptk	.Lcopy_ctop	};;
349392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcopy_cend:
350392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// Epilogue: set the return value, then restore ar.lc, sp and the
// predicates from the copies saved in the prologue.
351392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	mov		ret0=1			// signal "handled"
352392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	rum		1<<5			// clear um.mfh
353392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		ar.lc=prevlc	}
354392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mib;	.restore	sp
355392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		sp=prevsp
356392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		pr=prevpr,0x1ffff
357392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	br.ret.sptk.many	b0	};;
358392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.endp	bn_mul_mont_general#
359392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// Second set of symbolic register names, presumably for bn_mul_mont_8,
// which follows (its body is not fully visible from here -- confirm).
// Eight integer names a1..a8 on r16-r23, eight n1..n8 on r24-r31, and
// t0 on r15. Note that r15-r21 are the same registers that
// bn_mul_mont_general addresses as nptr, tptr, tp_1, num, len, lc and
// topbit.
360392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstroma1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromn1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromt0=r15;
363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
// Floating-point names: ai0..ai7 on f8-f15 and ni0..ni7 on f16-f23.
// f8 is also the register named bi in bn_mul_mont_general.
364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
365392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	64
368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.skip	48		// aligns loop body
369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.local	bn_mul_mont_8#
370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.proc	bn_mul_mont_8#
371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombn_mul_mont_8:
372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.prologue
373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	.save		ar.pfs,prevfs
374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	alloc		prevfs=ar.pfs,6,2,0,8
375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.vframe		prevsp
376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		prevsp=sp
377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.save		ar.lc,prevlc
378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		prevlc=ar.lc	}
379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	add		r17=-6*16,sp
380392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		sp=-7*16,sp
381392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.save		pr,prevpr
382392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		prevpr=pr	};;
383392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
384392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	.save.gf	0,0x10
385392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	stf.spill	[sp]=f16,-16
386392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.save.gf	0,0x20
387392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	stf.spill	[r17]=f17,32
388392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		r16=-5*16,prevsp};;
389392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	.save.gf	0,0x40
390392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	stf.spill	[r16]=f18,32
391392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.save.gf	0,0x80
392392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	stf.spill	[r17]=f19,32
393392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$ADDP		aptr=0,in1	};;
394392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	.save.gf	0,0x100
395392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	stf.spill	[r16]=f20,32
396392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.save.gf	0,0x200
397392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	stf.spill	[r17]=f21,32
398392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$ADDP		r29=8,in1	};;
399392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	.save.gf	0,0x400
400392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	stf.spill	[r16]=f22
401392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.save.gf	0,0x800
402392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	stf.spill	[r17]=f23
403392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$ADDP		rptr=0,in0	};;
404392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
405392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.body
406392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rotf		bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
407392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rotr		t[8]
408392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
409392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// load input vectors padding them to 8 elements
410392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf8		ai0=[aptr],16		// ap[0]
411392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ldf8		ai1=[r29],16		// ap[1]
412392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$ADDP		bptr=0,in2	}
413392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	$ADDP		r30=8,in2
414392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$ADDP		nptr=0,in3
415392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$ADDP		r31=8,in3	};;
416392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf8		bj[7]=[bptr],16		// bp[0]
417392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ldf8		bj[6]=[r30],16		// bp[1]
418392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp4.le		p4,p5=3,in5	}
419392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf8		ni0=[nptr],16		// np[0]
420392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ldf8		ni1=[r31],16		// np[1]
421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp4.le		p6,p7=4,in5	};;
422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p4)ldf8	ai2=[aptr],16		// ap[2]
424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p5)fcvt.fxu	ai2=f0
425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp4.le		p8,p9=5,in5	}
426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p6)ldf8	ai3=[r29],16		// ap[3]
427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p7)fcvt.fxu	ai3=f0
428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp4.le		p10,p11=6,in5	}
429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p4)ldf8	bj[5]=[bptr],16		// bp[2]
430392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p5)fcvt.fxu	bj[5]=f0
431392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp4.le		p12,p13=7,in5	}
432392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p6)ldf8	bj[4]=[r30],16		// bp[3]
433392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p7)fcvt.fxu	bj[4]=f0
434392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp4.le		p14,p15=8,in5	}
435392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p4)ldf8	ni2=[nptr],16		// np[2]
436392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p5)fcvt.fxu	ni2=f0
437392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	addp4		r28=-1,in5	}
438392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p6)ldf8	ni3=[r31],16		// np[3]
439392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p7)fcvt.fxu	ni3=f0
440392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$ADDP		in4=0,in4	};;
441392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
442392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	ldf8		n0=[in4]
443392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	fcvt.fxu	tf[1]=f0
444392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	nop.i		0		}
445392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
446392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p8)ldf8	ai4=[aptr],16		// ap[4]
447392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p9)fcvt.fxu	ai4=f0
448392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		t[0]=r0		}
449392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p10)ldf8	ai5=[r29],16		// ap[5]
450392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p11)fcvt.fxu	ai5=f0
451392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		t[1]=r0		}
452392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p8)ldf8	bj[3]=[bptr],16		// bp[4]
453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p9)fcvt.fxu	bj[3]=f0
454392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		t[2]=r0		}
455392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p10)ldf8	bj[2]=[r30],16		// bp[5]
456392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p11)fcvt.fxu	bj[2]=f0
457392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		t[3]=r0		}
458392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p8)ldf8	ni4=[nptr],16		// np[4]
459392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p9)fcvt.fxu	ni4=f0
460392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		t[4]=r0		}
461392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p10)ldf8	ni5=[r31],16		// np[5]
462392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p11)fcvt.fxu	ni5=f0
463392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		t[5]=r0		};;
464392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
465392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p12)ldf8	ai6=[aptr],16		// ap[6]
466392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p13)fcvt.fxu	ai6=f0
467392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		t[6]=r0		}
468392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p14)ldf8	ai7=[r29],16		// ap[7]
469392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p15)fcvt.fxu	ai7=f0
470392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		t[7]=r0		}
471392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p12)ldf8	bj[1]=[bptr],16		// bp[6]
472392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p13)fcvt.fxu	bj[1]=f0
473392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		ar.lc=r28	}
474392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p14)ldf8	bj[0]=[r30],16		// bp[7]
475392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p15)fcvt.fxu	bj[0]=f0
476392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		ar.ec=1		}
477392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p12)ldf8	ni6=[nptr],16		// np[6]
478392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p13)fcvt.fxu	ni6=f0
479392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		pr.rot=1<<16	}
480392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfb;	(p14)ldf8	ni7=[r31],16		// np[7]
481392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p15)fcvt.fxu	ni7=f0
482392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	brp.loop.imp	.Louter_8_ctop,.Louter_8_cend-16
483392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom					};;
484392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
485392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// The loop is scheduled for 32*n ticks on Itanium 2. An actual attempt
486392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// to measure it with the help of the Interval Time Counter indicated that
487392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// the factor is a tad higher: 33 or 34, if not 35. Exact measurement and
488392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// addressing the issue are problematic, because I don't have access
489392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// to a platform-specific instruction-level profiler. On Itanium it
490392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// should run in 56*n ticks, because of higher xma latency...
491392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Louter_8_ctop:
492392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p40,p42
493392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p48,p50
494392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 0:
495392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		ahi[0]=ai0,bj[7],tf[1]	//	ap[0]*b[i]+t[0]
496392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	add		a3=a3,n3	}	//	(p17) a3+=n3
497392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p42)	add		a3=a3,n3,1
498392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		alo[0]=ai0,bj[7],tf[1]
499392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
500392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
501392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
502392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	add		t[6]=t[6],a3,1	};;
503392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
504392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
505392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	cmp.ltu		p43,p41=a3,n3	}
506392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
507392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
508392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
509392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
510392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	cmp.ltu		p51,p49=t[6],a3
511392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	cmp.leu		p51,p49=t[6],a3	};;
512392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p41,p43
513392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p49,p51
514392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 4:
515392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		ahi[1]=ai1,bj[7],ahi[0]	//	ap[1]*b[i]
516392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p41)	add		a4=a4,n4	}	//	(p17) a4+=n4
517392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p43)	add		a4=a4,n4,1
518392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		alo[1]=ai1,bj[7],ahi[0]
519392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
520392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
521392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xmpy.lu		mj[0]=alo[0],n0		//	(ap[0]*b[i]+t[0])*n0
522392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p51)	add		t[5]=t[5],a4,1	};;
523392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 6:
524392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
525392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p41)	cmp.ltu		p42,p40=a4,n4	}
526392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
527392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
528392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
529392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
530392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p49)	cmp.ltu		p50,p48=t[5],a4
531392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p51)	cmp.leu		p50,p48=t[5],a4	};;
532392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p40,p42
533392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p48,p50
534392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 8:
535392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		ahi[2]=ai2,bj[7],ahi[1]	//	ap[2]*b[i]
536392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	add		a5=a5,n5	}	//	(p17) a5+=n5
537392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p42)	add		a5=a5,n5,1
538392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		alo[2]=ai2,bj[7],ahi[1]
539392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
540392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p16)	getf.sig	a1=alo[1]		// 9:
541392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
542392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	add		t[4]=t[4],a5,1	};;
543392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 10:
544392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		nhi[0]=ni0,mj[0],alo[0]	//	np[0]*m0
545392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	cmp.ltu		p43,p41=a5,n5	}
546392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p42)	cmp.leu		p43,p41=a5,n5
547392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		nlo[0]=ni0,mj[0],alo[0]
548392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
549392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
550392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	cmp.ltu		p51,p49=t[4],a5
551392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	cmp.leu		p51,p49=t[4],a5	};;
552392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p41,p43
553392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p49,p51
554392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p17)	getf.sig	n8=nhi[8]		// 12:
555392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		ahi[3]=ai3,bj[7],ahi[2]	//	ap[3]*b[i]
556392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p41)	add		a6=a6,n6	}	//	(p17) a6+=n6
557392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p43)	add		a6=a6,n6,1
558392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		alo[3]=ai3,bj[7],ahi[2]
559392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
560392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p16)	getf.sig	a2=alo[2]		// 13:
561392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
562392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p51)	add		t[3]=t[3],a6,1	};;
563392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 14:
564392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		nhi[1]=ni1,mj[0],nhi[0]	//	np[1]*m0
565392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p41)	cmp.ltu		p42,p40=a6,n6	}
566392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p43)	cmp.leu		p42,p40=a6,n6
567392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		nlo[1]=ni1,mj[0],nhi[0]
568392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
569392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p16)	nop.m		0			// 15:
570392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p49)	cmp.ltu		p50,p48=t[3],a6
571392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p51)	cmp.leu		p50,p48=t[3],a6	};;
572392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p40,p42
573392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p48,p50
574392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 16:
575392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		ahi[4]=ai4,bj[7],ahi[3]	//	ap[4]*b[i]
576392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	add		a7=a7,n7	}	//	(p17) a7+=n7
577392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p42)	add		a7=a7,n7,1
578392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		alo[4]=ai4,bj[7],ahi[3]
579392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
580392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p16)	getf.sig	a3=alo[3]		// 17:
581392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
582392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	add		t[2]=t[2],a7,1	};;
583392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 18:
584392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		nhi[2]=ni2,mj[0],nhi[1]	//	np[2]*m0
585392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	cmp.ltu		p43,p41=a7,n7	}
586392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p42)	cmp.leu		p43,p41=a7,n7
587392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		nlo[2]=ni2,mj[0],nhi[1]
588392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
589392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p16)	getf.sig	n1=nlo[1]		// 19:
590392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	cmp.ltu		p51,p49=t[2],a7
591392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	cmp.leu		p51,p49=t[2],a7	};;
592392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p41,p43
593392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p49,p51
594392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 20:
595392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		ahi[5]=ai5,bj[7],ahi[4]	//	ap[5]*b[i]
596392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p41)	add		a8=a8,n8	}	//	(p17) a8+=n8
597392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p43)	add		a8=a8,n8,1
598392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		alo[5]=ai5,bj[7],ahi[4]
599392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
600392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p16)	getf.sig	a4=alo[4]		// 21:
601392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
602392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p51)	add		t[1]=t[1],a8,1	};;
603392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 22:
604392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		nhi[3]=ni3,mj[0],nhi[2]	//	np[3]*m0
605392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p41)	cmp.ltu		p42,p40=a8,n8	}
606392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p43)	cmp.leu		p42,p40=a8,n8
607392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		nlo[3]=ni3,mj[0],nhi[2]
608392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
609392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p16)	getf.sig	n2=nlo[2]		// 23:
610392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p49)	cmp.ltu		p50,p48=t[1],a8
611392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p51)	cmp.leu		p50,p48=t[1],a8	};;
612392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 24:
613392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		ahi[6]=ai6,bj[7],ahi[5]	//	ap[6]*b[i]
614392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	add		a1=a1,n1	}	//	(p16) a1+=n1
615392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0
616392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		alo[6]=ai6,bj[7],ahi[5]
617392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p17)	mov		t[0]=r0		};;
618392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p16)	getf.sig	a5=alo[5]		// 25:
619392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	add		t0=t[7],a1		//	(p16) t[7]+=a1
620392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p42)	add		t[0]=t[0],r0,1	};;
621392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	setf.sig	tf[0]=t0		// 26:
622392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		nhi[4]=ni4,mj[0],nhi[3]	//	np[4]*m0
623392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	add		t[0]=t[0],r0,1	}
624392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	cmp.ltu.unc	p42,p40=a1,n1
625392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		nlo[4]=ni4,mj[0],nhi[3]
626392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
627392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p16)	getf.sig	n3=nlo[3]		// 27:
628392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	cmp.ltu.unc	p50,p48=t0,a1
629392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
630392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p40,p42
631392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p48,p50
632392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 28:
633392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		ahi[7]=ai7,bj[7],ahi[6]	//	ap[7]*b[i]
634392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	add		a2=a2,n2	}	//	(p16) a2+=n2
635392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p42)	add		a2=a2,n2,1
636392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		alo[7]=ai7,bj[7],ahi[6]
637392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
638392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p16)	getf.sig	a6=alo[6]		// 29:
639392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	add		t[6]=t[6],a2		//	(p16) t[6]+=a2
640392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	add		t[6]=t[6],a2,1	};;
641392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	nop.m		0			// 30:
642392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.hu		nhi[5]=ni5,mj[0],nhi[4]	//	np[5]*m0
643392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	cmp.ltu		p41,p39=a2,n2	}
644392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p42)	cmp.leu		p41,p39=a2,n2
645392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	xma.lu		nlo[5]=ni5,mj[0],nhi[4]
646392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.i		0		};;
647392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p16)	getf.sig	n4=nlo[4]		// 31:
648392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.f		0
649392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	cmp.ltu		p49,p47=t[6],a2	}
650392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfb;	(p50)	cmp.leu		p49,p47=t[6],a2
651392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p16)	nop.f		0
652392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	br.ctop.sptk.many	.Louter_8_ctop	};;
653392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Louter_8_cend:
654392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
655392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// The above loop has to execute one more time, without the (p16) stage,
656392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// which is replaced with a merged move of np[0..7] (ni0-ni7) to the GPR bank.
657392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p40,p42
658392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p48,p50
659392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p0)	getf.sig	n1=ni0			// 0:
660392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	add		a3=a3,n3		//	(p17) a3+=n3
661392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p42)	add		a3=a3,n3,1	};;
662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
663392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
664392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	add		t[6]=t[6],a3,1	};;
665392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
666392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
667392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	cmp.ltu		p43,p41=a3,n3	}
668392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
669392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
670392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p0)	nop.i		0		};;
671392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
672392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	cmp.ltu		p51,p49=t[6],a3
673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	cmp.leu		p51,p49=t[6],a3	};;
674392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p41,p43
675392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p49,p51
676392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p0)	getf.sig	n2=ni1			// 4:
677392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p41)	add		a4=a4,n4		//	(p17) a4+=n4
678392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p43)	add		a4=a4,n4,1	};;
679392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
680392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p0)	nop.f		0
681392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p51)	add		t[5]=t[5],a4,1	};;
682392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p0)	getf.sig	n3=ni2			// 6:
683392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
684392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p41)	cmp.ltu		p42,p40=a4,n4	}
685392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
686392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
687392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p0)	nop.i		0		};;
688392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
689392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p49)	cmp.ltu		p50,p48=t[5],a4
690392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p51)	cmp.leu		p50,p48=t[5],a4	};;
691392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p40,p42
692392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p48,p50
693392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p0)	getf.sig	n4=ni3			// 8:
694392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	add		a5=a5,n5		//	(p17) a5+=n5
695392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p42)	add		a5=a5,n5,1	};;
696392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p0)	nop.m		0			// 9:
697392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
698392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	add		t[4]=t[4],a5,1	};;
699392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p0)	nop.m		0			// 10:
700392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	cmp.ltu		p43,p41=a5,n5
701392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p42)	cmp.leu		p43,p41=a5,n5	};;
702392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
703392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	cmp.ltu		p51,p49=t[4],a5
704392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	cmp.leu		p51,p49=t[4],a5	};;
705392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p41,p43
706392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p49,p51
707392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p17)	getf.sig	n8=nhi[8]		// 12:
708392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p41)	add		a6=a6,n6		//	(p17) a6+=n6
709392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p43)	add		a6=a6,n6,1	};;
710392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p0)	getf.sig	n5=ni4			// 13:
711392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
712392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p51)	add		t[3]=t[3],a6,1	};;
713392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p0)	nop.m		0			// 14:
714392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p41)	cmp.ltu		p42,p40=a6,n6
715392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p43)	cmp.leu		p42,p40=a6,n6	};;
716392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p0)	getf.sig	n6=ni5			// 15:
717392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p49)	cmp.ltu		p50,p48=t[3],a6
718392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p51)	cmp.leu		p50,p48=t[3],a6	};;
719392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p40,p42
720392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p48,p50
721392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p0)	nop.m		0			// 16:
722392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	add		a7=a7,n7		//	(p17) a7+=n7
723392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p42)	add		a7=a7,n7,1	};;
724392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p0)	nop.m		0			// 17:
725392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
726392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	add		t[2]=t[2],a7,1	};;
727392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p0)	nop.m		0			// 18:
728392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p40)	cmp.ltu		p43,p41=a7,n7
729392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p42)	cmp.leu		p43,p41=a7,n7	};;
730392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p0)	getf.sig	n7=ni6			// 19:
731392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p48)	cmp.ltu		p51,p49=t[2],a7
732392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p50)	cmp.leu		p51,p49=t[2],a7	};;
733392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p41,p43
734392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel		"mutex",p49,p51
735392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p0)	nop.m		0			// 20:
736392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p41)	add		a8=a8,n8		//	(p17) a8+=n8
737392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p43)	add		a8=a8,n8,1	};;
738392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p0)	nop.m		0			// 21:
739392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
740392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p51)	add		t[1]=t[1],a8,1	}
741392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p17)	mov		t[0]=r0
742392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p41)	cmp.ltu		p42,p40=a8,n8
743392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p43)	cmp.leu		p42,p40=a8,n8	};;
744392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p0)	getf.sig	n8=ni7			// 22:
745392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p49)	cmp.ltu		p50,p48=t[1],a8
746392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p51)	cmp.leu		p50,p48=t[1],a8	}
747392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p42)	add		t[0]=t[0],r0,1
748392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p0)	add		r16=-7*16,prevsp
749392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p0)	add		r17=-6*16,prevsp	};;
750392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
751392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// subtract np[8] from carrybit|tmp[8]
752392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// carrybit|tmp[8] layout upon exit from above loop is:
753392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom//	t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
754392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p50)add	t[0]=t[0],r0,1
755392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		r18=-5*16,prevsp
756392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub		n1=t0,n1	};;
757392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	cmp.gtu		p34,p32=n1,t0;;
758392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel	"mutex",p32,p34
759392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p32)sub	n2=t[7],n2
760392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)sub	n2=t[7],n2,1	};;
761392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p32)cmp.gtu	p35,p33=n2,t[7]
762392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)cmp.geu	p35,p33=n2,t[7];;
763392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel	"mutex",p33,p35
764392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p33)sub	n3=t[6],n3	}
765392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p35)sub	n3=t[6],n3,1;;
766392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p33)cmp.gtu	p34,p32=n3,t[6]
767392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p35)cmp.geu	p34,p32=n3,t[6]	};;
768392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel	"mutex",p32,p34
769392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p32)sub	n4=t[5],n4
770392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)sub	n4=t[5],n4,1;;
771392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p32)cmp.gtu	p35,p33=n4,t[5]	}
772392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p34)cmp.geu	p35,p33=n4,t[5];;
773392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel	"mutex",p33,p35
774392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p33)sub	n5=t[4],n5
775392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p35)sub	n5=t[4],n5,1	};;
776392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p33)cmp.gtu	p34,p32=n5,t[4]
777392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p35)cmp.geu	p34,p32=n5,t[4];;
778392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel	"mutex",p32,p34
779392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p32)sub	n6=t[3],n6	}
780392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p34)sub	n6=t[3],n6,1;;
781392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p32)cmp.gtu	p35,p33=n6,t[3]
782392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)cmp.geu	p35,p33=n6,t[3]	};;
783392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel	"mutex",p33,p35
784392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p33)sub	n7=t[2],n7
785392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p35)sub	n7=t[2],n7,1;;
786392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p33)cmp.gtu	p34,p32=n7,t[2]	}
787392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p35)cmp.geu	p34,p32=n7,t[2];;
788392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel	"mutex",p32,p34
789392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p32)sub	n8=t[1],n8
790392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)sub	n8=t[1],n8,1	};;
791392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii;	(p32)cmp.gtu	p35,p33=n8,t[1]
792392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)cmp.geu	p35,p33=n8,t[1];;
793392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel	"mutex",p33,p35
794392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p33)sub	a8=t[0],r0	}
795392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p35)sub	a8=t[0],r0,1;;
796392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p33)cmp.gtu	p34,p32=a8,t[0]
797392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p35)cmp.geu	p34,p32=a8,t[0]	};;
798392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
799392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom// save the result, either tmp[num] or tmp[num]-np[num]
800392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.pred.rel	"mutex",p32,p34
801392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	(p32)st8	[rptr]=n1,8
802392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)st8	[rptr]=t0,8
803392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add		r19=-4*16,prevsp};;
804392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	(p32)st8	[rptr]=n2,8
805392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)st8	[rptr]=t[7],8
806392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p5)br.cond.dpnt.few	.Ldone	};;
807392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	(p32)st8	[rptr]=n3,8
808392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)st8	[rptr]=t[6],8
809392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p7)br.cond.dpnt.few	.Ldone	};;
810392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	(p32)st8	[rptr]=n4,8
811392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)st8	[rptr]=t[5],8
812392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p9)br.cond.dpnt.few	.Ldone	};;
813392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	(p32)st8	[rptr]=n5,8
814392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)st8	[rptr]=t[4],8
815392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p11)br.cond.dpnt.few	.Ldone	};;
816392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	(p32)st8	[rptr]=n6,8
817392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)st8	[rptr]=t[3],8
818392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p13)br.cond.dpnt.few	.Ldone	};;
819392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	(p32)st8	[rptr]=n7,8
820392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)st8	[rptr]=t[2],8
821392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p15)br.cond.dpnt.few	.Ldone	};;
822392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmb;	(p32)st8	[rptr]=n8,8
823392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	(p34)st8	[rptr]=t[1],8
824392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	nop.b		0		};;
825392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Ldone:						// epilogue
826392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf.fill	f16=[r16],64
827392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ldf.fill	f17=[r17],64
828392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	nop.i		0		}
829392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf.fill	f18=[r18],64
830392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ldf.fill	f19=[r19],64
831392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		pr=prevpr,0x1ffff	};;
832392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf.fill	f20=[r16]
833392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ldf.fill	f21=[r17]
834392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		ar.lc=prevlc	}
835392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ldf.fill	f22=[r18]
836392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ldf.fill	f23=[r19]
837392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		ret0=1		}	// signal "handled"
838392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mib;	rum		1<<5
839392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.restore	sp
840392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov		sp=prevsp
841392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	br.ret.sptk.many	b0	};;
842392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.endp	bn_mul_mont_8#
843392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
844392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	copyright#,\@object
845392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromcopyright:
846392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromstringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
847392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
848392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Emit the generated assembly held in $code.
#
# The first command-line argument, when present and true, names the output
# file; otherwise the text goes to the STDOUT inherited from the caller.
$output=shift;
if ($output) {
	# Three-argument open keeps characters in the file name from being
	# interpreted as part of the open mode. Fail loudly so the build does
	# not continue with a missing output file.
	open STDOUT,">",$output or die "can't open $output: $!";
}
print $code;
# Buffered write errors (disk full, I/O error) only surface when the handle
# is flushed on close, so the result must be checked to avoid silently
# leaving a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
852