#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2007.
#
# Performance improvement over vanilla C code varies from 85% to 45%
# depending on key length and benchmark. Unfortunately in this context
# these are not very impressive results [for code that utilizes "wide"
# 64x64=128-bit multiplication, which is not commonly available to C
# programmers], at least hand-coded bn_asm.c replacement is known to
# provide 30-40% better results for longest keys. Well, on a second
# thought it's not very surprising, because z-CPUs are single-issue
# and _strictly_ in-order execution, while bn_mul_mont is more or less
# dependent on CPU ability to pipe-line instructions and have several
# of them "in-flight" at the same time. I mean while other methods,
# for example Karatsuba, aim to minimize amount of multiplications at
# the cost of other operations increase, bn_mul_mont aims to neatly
# "overlap" multiplications and the other operations [and on most
# platforms even minimize the amount of the other operations, in
# particular references to memory]. But it's possible to improve this
# module performance by implementing dedicated squaring code-path and
# possibly by unrolling loops...

# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
# On z990 it was measured to perform 2.6-2.2 times better than
# compiler-generated code, less for longer keys...

# Command line: <flavour> [ignored args ...] [<output file>].
# A flavour containing "31" or "32" selects 4-byte pointers/BN_ULONG and the
# plain load/store mnemonics; any other flavour selects 8-byte values and the
# "g"-suffixed mnemonics ($g is spliced into l${g}, st${g}, stm${g}, ...).
$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Consume arguments until one looks like a file name ("name.ext") or the
# argument list is exhausted; $output ends up false in the latter case.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output name was actually given, so that
# invoking the script without one keeps writing to the inherited stdout.
# Use the three-argument form and fail loudly instead of silently emitting
# the assembly to the wrong place.
if ($output) {
	open(STDOUT,">",$output) || die "can't open $output: $!";
}

# Size in bytes of the fixed part of the stack frame: 16 slots of $SIZE_T
# plus 4 slots of 8 bytes (presumably the ABI register save area -- confirm
# against the s390/s390x ELF ABI).  Used as the base offset of tp[] below.
$stdframe=16*$SIZE_T+4*8;

# Symbolic names for the general registers used by the assembly template.
# These two are working registers that are not part of the C argument list:
# $mn0 holds tp[0]*n0 and $num holds the length (loaded from the stack).
$mn0="%r0";
$num="%r1";

# int bn_mul_mont(
$rp="%r2";		# BN_ULONG *rp,
$ap="%r3";		# const BN_ULONG *ap,
$bp="%r4";		# const BN_ULONG *bp,
$np="%r5";		# const BN_ULONG *np,
$n0="%r6";		# const BN_ULONG *n0,
#$num="160(%r15)"	# int num);

# $bi shares %r2 with $rp: the template stores %r2 into the caller's frame
# on entry and reloads it ("reincarnate rp") once the multiplication is done.
$bi="%r2";	# zaps rp
$j="%r7";	# byte index into ap/np/tp

# Operands and results of the wide multiplications; the template passes
# $ahi/$nhi (even registers) to mlgr, whose low halves are $alo/$nlo.
$ahi="%r8";
$alo="%r9";
$nhi="%r10";
$nlo="%r11";
$AHI="%r12";	# running carry word of the ap[j]*bp[i] chain
$NHI="%r13";	# running carry word of the np[j]*m1 chain
$count="%r14";	# brct loop counter
$sp="%r15";	# stack pointer

# Assembly template for bn_mul_mont, accumulated in $code.  Inside the
# template, text between back-ticks is a Perl expression and "_dswap" is a
# pseudo-instruction; both are expanded by the output loop at the end of this
# file.  The template text itself is emitted verbatim and must not be edited
# casually: "#" comments inside the heredocs are part of the generated file.
#
# Prologue: load $num, scale it to bytes, stash %r2 (rp) in the caller's
# frame and return 0 ("not processed") for inputs shorter than 16 bytes.
$code.=<<___;
.text
.globl	bn_mul_mont
.type	bn_mul_mont,\@function
bn_mul_mont:
	lgf	$num,`$stdframe+$SIZE_T-4`($sp)	# pull $num
	sla	$num,`log($SIZE_T)/log(2)`	# $num to enumerate bytes
	la	$bp,0($num,$bp)

	st${g}	%r2,2*$SIZE_T($sp)

	cghi	$num,16		#
	lghi	%r2,0		#
	blr	%r14		# if($num<16) return 0;
___
# 31/32-bit flavours only: $num is already a byte count here, so testing
# bit value 4 rejects an odd number of 4-byte words (the code below always
# works on 8-byte quantities).
$code.=<<___ if ($flavour =~ /3[12]/);
	tmll	$num,4
	bnzr	%r14		# if ($num&1) return 0;
___
# Other (64-bit) flavours only: return 0 when the scaled $num exceeds 96.
$code.=<<___ if ($flavour !~ /3[12]/);
	cghi	$num,96		#
	bhr	%r14		# if($num>96) return 0;
___
# Main body: save registers, allocate tp[] on the stack, run the first pass
# (.L1st), the outer/inner multiply-and-reduce loops (.Louter/.Linner), the
# final conditional subtraction (.Lsub) and the copy-out that also wipes
# tp[] (.Lcopy), then restore registers and return 1.
$code.=<<___;
	stm${g}	%r3,%r15,3*$SIZE_T($sp)

	lghi	$rp,-$stdframe-8	# leave room for carry bit
	lcgr	$j,$num		# -$num
	lgr	%r0,$sp
	la	$rp,0($rp,$sp)
	la	$sp,0($j,$rp)	# alloca
	st${g}	%r0,0($sp)	# back chain

	sra	$num,3		# restore $num
	la	$bp,0($j,$bp)	# restore $bp
	ahi	$num,-1		# adjust $num for inner loop
	lg	$n0,0($n0)	# pull n0
	_dswap	$n0

	lg	$bi,0($bp)
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[0]
	lgr	$AHI,$ahi

	lgr	$mn0,$alo	# "tp[0]"*n0
	msgr	$mn0,$n0

	lg	$nlo,0($np)	#
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8(%r0)	# j=1
	lr	$count,$num

.align	16
.L1st:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[0]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI	# +="tp[j]"
	algr	$nlo,$alo
	alcgr	$NHI,$nhi

	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.L1st

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI	# upmost overflow bit
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)
	la	$bp,8($bp)	# bp++

.Louter:
	lg	$bi,0($bp)	# bp[i]
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[i]
	alg	$alo,$stdframe($sp)	# +=tp[0]
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lgr	$mn0,$alo
	msgr	$mn0,$n0	# tp[0]*n0

	lg	$nlo,0($np)	# np[0]
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8(%r0)	# j=1
	lr	$count,$num

.align	16
.Linner:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[i]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$ahi,$AHI
	alg	$alo,$stdframe($j,$sp)# +=tp[j]
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI
	algr	$nlo,$alo	# +="tp[j]"
	alcgr	$NHI,$nhi

	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.Linner

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI
	alg	$NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
	lghi	$ahi,0
	alcgr	$AHI,$ahi	# new upmost overflow bit
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)

	la	$bp,8($bp)	# bp++
	cl${g}	$bp,`$stdframe+8+4*$SIZE_T`($j,$sp)	# compare to &bp[num]
	jne	.Louter

	l${g}	$rp,`$stdframe+8+2*$SIZE_T`($j,$sp)	# reincarnate rp
	la	$ap,$stdframe($sp)
	ahi	$num,1		# restore $num, incidentally clears "borrow"

	la	$j,0(%r0)
	lr	$count,$num
.Lsub:	lg	$alo,0($j,$ap)
	lg	$nlo,0($j,$np)
	_dswap	$nlo
	slbgr	$alo,$nlo
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lsub
	lghi	$ahi,0
	slbgr	$AHI,$ahi	# handle upmost carry

	ngr	$ap,$AHI
	lghi	$np,-1
	xgr	$np,$AHI
	ngr	$np,$rp
	ogr	$ap,$np		# ap=borrow?tp:rp

	la	$j,0(%r0)
	lgr	$count,$num
.Lcopy:	lg	$alo,0($j,$ap)		# copy or in-place refresh
	_dswap	$alo
	stg	$j,$stdframe($j,$sp)	# zap tp
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lcopy

	la	%r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
	lm${g}	%r6,%r15,0(%r1)
	lghi	%r2,1		# signal "processed"
	br	%r14
.size	bn_mul_mont,.-bn_mul_mont
.string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Expand the template line by line and write it to STDOUT.
foreach my $line (split("\n",$code)) {
	# Replace every `expr` with the value of the Perl expression, e.g.
	# `$stdframe+$SIZE_T-4` becomes a plain number the assembler accepts.
	$line =~ s/\`([^\`]*)\`/eval $1/ge;
	# "_dswap %rN" swaps the two 32-bit halves of a 64-bit register (rotate
	# left by 32).  It is needed only when BN_ULONG is 32 bits wide; in
	# every other case the pseudo-instruction is replaced by nothing.
	$line =~ s/_dswap\s+(%r[0-9]+)/($SIZE_T==4) ? sprintf("rllg\t%s,%s,32",$1,$1) : ""/e;
	print $line,"\n";
}
# Buffered write errors (e.g. a full disk) surface only at close time, so a
# failure here means the generated assembly file is incomplete.
close STDOUT or die "error closing STDOUT: $!";
278