sha1-s390x.pl revision 392aa7cc7d2b122614c5393c3e357da07fd07af3
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for s390x.
11
12# April 2007.
13#
14# Performance is >30% better than gcc 3.3 generated code. But the real
15# twist is that SHA1 hardware support is detected and utilized. In
16# which case performance can reach further >4.5x for larger chunks.
17
18# January 2009.
19#
20# Optimize Xupdate for amount of memory references and reschedule
21# instructions to favour dual-issue z10 pipeline. On z10 hardware is
22# "only" ~2.3x faster than software.
23
24# November 2010.
25#
26# Adapt for -m31 build. If kernel supports what's called "highgprs"
27# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
28# instructions and achieve "64-bit" performance even in 31-bit legacy
29# application context. The feature is not specific to any particular
30# processor, as long as it's "z-CPU". Latter implies that the code
31# remains z/Architecture specific.
32
33$kimdfunc=1;	# magic function code for kimd instruction
34
35$flavour = shift;
36
37if ($flavour =~ /3[12]/) {
38	$SIZE_T=4;
39	$g="";
40} else {
41	$SIZE_T=8;
42	$g="g";
43}
44
45while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
46open STDOUT,">$output";
47
48$K_00_39="%r0"; $K=$K_00_39;
49$K_40_79="%r1";
50$ctx="%r2";	$prefetch="%r2";
51$inp="%r3";
52$len="%r4";
53
54$A="%r5";
55$B="%r6";
56$C="%r7";
57$D="%r8";
58$E="%r9";	@V=($A,$B,$C,$D,$E);
59$t0="%r10";
60$t1="%r11";
61@X=("%r12","%r13","%r14");
62$sp="%r15";
63
64$stdframe=16*$SIZE_T+4*8;
65$frame=$stdframe+16*4;
66
67sub Xupdate {
68my $i=shift;
69
70$code.=<<___ if ($i==15);
71	lg	$prefetch,$stdframe($sp)	### Xupdate(16) warm-up
72	lr	$X[0],$X[2]
73___
74return if ($i&1);	# Xupdate is vectorized and executed every 2nd cycle
75$code.=<<___ if ($i<16);
76	lg	$X[0],`$i*4`($inp)	### Xload($i)
77	rllg	$X[1],$X[0],32
78___
79$code.=<<___ if ($i>=16);
80	xgr	$X[0],$prefetch		### Xupdate($i)
81	lg	$prefetch,`$stdframe+4*(($i+2)%16)`($sp)
82	xg	$X[0],`$stdframe+4*(($i+8)%16)`($sp)
83	xgr	$X[0],$prefetch
84	rll	$X[0],$X[0],1
85	rllg	$X[1],$X[0],32
86	rll	$X[1],$X[1],1
87	rllg	$X[0],$X[1],32
88	lr	$X[2],$X[1]		# feedback
89___
90$code.=<<___ if ($i<=70);
91	stg	$X[0],`$stdframe+4*($i%16)`($sp)
92___
93unshift(@X,pop(@X));
94}
95
96sub BODY_00_19 {
97my ($i,$a,$b,$c,$d,$e)=@_;
98my $xi=$X[1];
99
100	&Xupdate($i);
101$code.=<<___;
102	alr	$e,$K		### $i
103	rll	$t1,$a,5
104	lr	$t0,$d
105	xr	$t0,$c
106	alr	$e,$t1
107	nr	$t0,$b
108	alr	$e,$xi
109	xr	$t0,$d
110	rll	$b,$b,30
111	alr	$e,$t0
112___
113}
114
115sub BODY_20_39 {
116my ($i,$a,$b,$c,$d,$e)=@_;
117my $xi=$X[1];
118
119	&Xupdate($i);
120$code.=<<___;
121	alr	$e,$K		### $i
122	rll	$t1,$a,5
123	lr	$t0,$b
124	alr	$e,$t1
125	xr	$t0,$c
126	alr	$e,$xi
127	xr	$t0,$d
128	rll	$b,$b,30
129	alr	$e,$t0
130___
131}
132
133sub BODY_40_59 {
134my ($i,$a,$b,$c,$d,$e)=@_;
135my $xi=$X[1];
136
137	&Xupdate($i);
138$code.=<<___;
139	alr	$e,$K		### $i
140	rll	$t1,$a,5
141	lr	$t0,$b
142	alr	$e,$t1
143	or	$t0,$c
144	lr	$t1,$b
145	nr	$t0,$d
146	nr	$t1,$c
147	alr	$e,$xi
148	or	$t0,$t1
149	rll	$b,$b,30
150	alr	$e,$t0
151___
152}
153
154$code.=<<___;
155.text
156.align	64
157.type	Ktable,\@object
158Ktable: .long	0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
159	.skip	48	#.long	0,0,0,0,0,0,0,0,0,0,0,0
160.size	Ktable,.-Ktable
161.globl	sha1_block_data_order
162.type	sha1_block_data_order,\@function
163sha1_block_data_order:
164___
165$code.=<<___ if ($kimdfunc);
166	larl	%r1,OPENSSL_s390xcap_P
167	lg	%r0,0(%r1)
168	tmhl	%r0,0x4000	# check for message-security assist
169	jz	.Lsoftware
170	lghi	%r0,0
171	la	%r1,`2*$SIZE_T`($sp)
172	.long	0xb93e0002	# kimd %r0,%r2
173	lg	%r0,`2*$SIZE_T`($sp)
174	tmhh	%r0,`0x8000>>$kimdfunc`
175	jz	.Lsoftware
176	lghi	%r0,$kimdfunc
177	lgr	%r1,$ctx
178	lgr	%r2,$inp
179	sllg	%r3,$len,6
180	.long	0xb93e0002	# kimd %r0,%r2
181	brc	1,.-4		# pay attention to "partial completion"
182	br	%r14
183.align	16
184.Lsoftware:
185___
186$code.=<<___;
187	lghi	%r1,-$frame
188	st${g}	$ctx,`2*$SIZE_T`($sp)
189	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
190	lgr	%r0,$sp
191	la	$sp,0(%r1,$sp)
192	st${g}	%r0,0($sp)
193
194	larl	$t0,Ktable
195	llgf	$A,0($ctx)
196	llgf	$B,4($ctx)
197	llgf	$C,8($ctx)
198	llgf	$D,12($ctx)
199	llgf	$E,16($ctx)
200
201	lg	$K_00_39,0($t0)
202	lg	$K_40_79,8($t0)
203
204.Lloop:
205	rllg	$K_00_39,$K_00_39,32
206___
207for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
208$code.=<<___;
209	rllg	$K_00_39,$K_00_39,32
210___
211for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
212$code.=<<___;	$K=$K_40_79;
213	rllg	$K_40_79,$K_40_79,32
214___
215for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
216$code.=<<___;
217	rllg	$K_40_79,$K_40_79,32
218___
219for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
220$code.=<<___;
221
222	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
223	la	$inp,64($inp)
224	al	$A,0($ctx)
225	al	$B,4($ctx)
226	al	$C,8($ctx)
227	al	$D,12($ctx)
228	al	$E,16($ctx)
229	st	$A,0($ctx)
230	st	$B,4($ctx)
231	st	$C,8($ctx)
232	st	$D,12($ctx)
233	st	$E,16($ctx)
234	brct${g} $len,.Lloop
235
236	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
237	br	%r14
238.size	sha1_block_data_order,.-sha1_block_data_order
239.string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
240.comm	OPENSSL_s390xcap_P,16,8
241___
242
243$code =~ s/\`([^\`]*)\`/eval $1/gem;
244
245print $code;
246close STDOUT;
247