sha1-s390x.pl revision 221304ee937bc0910948a8be1320cb8cc4eb6d36
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for s390x.
11
12# April 2007.
13#
14# Performance is >30% better than gcc 3.3 generated code. But the real
15# twist is that SHA1 hardware support is detected and utilized. In
16# which case performance can reach further >4.5x for larger chunks.
17
18# January 2009.
19#
20# Optimize Xupdate for amount of memory references and reschedule
21# instructions to favour dual-issue z10 pipeline. On z10 hardware is
22# "only" ~2.3x faster than software.
23
24$kimdfunc=1;	# magic function code for kimd instruction
25
26$output=shift;
27open STDOUT,">$output";
28
29$K_00_39="%r0"; $K=$K_00_39;
30$K_40_79="%r1";
31$ctx="%r2";	$prefetch="%r2";
32$inp="%r3";
33$len="%r4";
34
35$A="%r5";
36$B="%r6";
37$C="%r7";
38$D="%r8";
39$E="%r9";	@V=($A,$B,$C,$D,$E);
40$t0="%r10";
41$t1="%r11";
42@X=("%r12","%r13","%r14");
43$sp="%r15";
44
45$frame=160+16*4;
46
47sub Xupdate {
48my $i=shift;
49
50$code.=<<___ if ($i==15);
51	lg	$prefetch,160($sp)	### Xupdate(16) warm-up
52	lr	$X[0],$X[2]
53___
54return if ($i&1);	# Xupdate is vectorized and executed every 2nd cycle
55$code.=<<___ if ($i<16);
56	lg	$X[0],`$i*4`($inp)	### Xload($i)
57	rllg	$X[1],$X[0],32
58___
59$code.=<<___ if ($i>=16);
60	xgr	$X[0],$prefetch		### Xupdate($i)
61	lg	$prefetch,`160+4*(($i+2)%16)`($sp)
62	xg	$X[0],`160+4*(($i+8)%16)`($sp)
63	xgr	$X[0],$prefetch
64	rll	$X[0],$X[0],1
65	rllg	$X[1],$X[0],32
66	rll	$X[1],$X[1],1
67	rllg	$X[0],$X[1],32
68	lr	$X[2],$X[1]		# feedback
69___
70$code.=<<___ if ($i<=70);
71	stg	$X[0],`160+4*($i%16)`($sp)
72___
73unshift(@X,pop(@X));
74}
75
76sub BODY_00_19 {
77my ($i,$a,$b,$c,$d,$e)=@_;
78my $xi=$X[1];
79
80	&Xupdate($i);
81$code.=<<___;
82	alr	$e,$K		### $i
83	rll	$t1,$a,5
84	lr	$t0,$d
85	xr	$t0,$c
86	alr	$e,$t1
87	nr	$t0,$b
88	alr	$e,$xi
89	xr	$t0,$d
90	rll	$b,$b,30
91	alr	$e,$t0
92___
93}
94
95sub BODY_20_39 {
96my ($i,$a,$b,$c,$d,$e)=@_;
97my $xi=$X[1];
98
99	&Xupdate($i);
100$code.=<<___;
101	alr	$e,$K		### $i
102	rll	$t1,$a,5
103	lr	$t0,$b
104	alr	$e,$t1
105	xr	$t0,$c
106	alr	$e,$xi
107	xr	$t0,$d
108	rll	$b,$b,30
109	alr	$e,$t0
110___
111}
112
113sub BODY_40_59 {
114my ($i,$a,$b,$c,$d,$e)=@_;
115my $xi=$X[1];
116
117	&Xupdate($i);
118$code.=<<___;
119	alr	$e,$K		### $i
120	rll	$t1,$a,5
121	lr	$t0,$b
122	alr	$e,$t1
123	or	$t0,$c
124	lr	$t1,$b
125	nr	$t0,$d
126	nr	$t1,$c
127	alr	$e,$xi
128	or	$t0,$t1
129	rll	$b,$b,30
130	alr	$e,$t0
131___
132}
133
134$code.=<<___;
135.text
136.align	64
137.type	Ktable,\@object
138Ktable: .long	0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
139	.skip	48	#.long	0,0,0,0,0,0,0,0,0,0,0,0
140.size	Ktable,.-Ktable
141.globl	sha1_block_data_order
142.type	sha1_block_data_order,\@function
143sha1_block_data_order:
144___
145$code.=<<___ if ($kimdfunc);
146	larl	%r1,OPENSSL_s390xcap_P
147	lg	%r0,0(%r1)
148	tmhl	%r0,0x4000	# check for message-security assist
149	jz	.Lsoftware
150	lghi	%r0,0
151	la	%r1,16($sp)
152	.long	0xb93e0002	# kimd %r0,%r2
153	lg	%r0,16($sp)
154	tmhh	%r0,`0x8000>>$kimdfunc`
155	jz	.Lsoftware
156	lghi	%r0,$kimdfunc
157	lgr	%r1,$ctx
158	lgr	%r2,$inp
159	sllg	%r3,$len,6
160	.long	0xb93e0002	# kimd %r0,%r2
161	brc	1,.-4		# pay attention to "partial completion"
162	br	%r14
163.align	16
164.Lsoftware:
165___
166$code.=<<___;
167	lghi	%r1,-$frame
168	stg	$ctx,16($sp)
169	stmg	%r6,%r15,48($sp)
170	lgr	%r0,$sp
171	la	$sp,0(%r1,$sp)
172	stg	%r0,0($sp)
173
174	larl	$t0,Ktable
175	llgf	$A,0($ctx)
176	llgf	$B,4($ctx)
177	llgf	$C,8($ctx)
178	llgf	$D,12($ctx)
179	llgf	$E,16($ctx)
180
181	lg	$K_00_39,0($t0)
182	lg	$K_40_79,8($t0)
183
184.Lloop:
185	rllg	$K_00_39,$K_00_39,32
186___
187for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
188$code.=<<___;
189	rllg	$K_00_39,$K_00_39,32
190___
191for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
192$code.=<<___;	$K=$K_40_79;
193	rllg	$K_40_79,$K_40_79,32
194___
195for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
196$code.=<<___;
197	rllg	$K_40_79,$K_40_79,32
198___
199for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
200$code.=<<___;
201
202	lg	$ctx,`$frame+16`($sp)
203	la	$inp,64($inp)
204	al	$A,0($ctx)
205	al	$B,4($ctx)
206	al	$C,8($ctx)
207	al	$D,12($ctx)
208	al	$E,16($ctx)
209	st	$A,0($ctx)
210	st	$B,4($ctx)
211	st	$C,8($ctx)
212	st	$D,12($ctx)
213	st	$E,16($ctx)
214	brct	$len,.Lloop
215
216	lmg	%r6,%r15,`$frame+48`($sp)
217	br	%r14
218.size	sha1_block_data_order,.-sha1_block_data_order
219.string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
220.comm	OPENSSL_s390xcap_P,8,8
221___
222
223$code =~ s/\`([^\`]*)\`/eval $1/gem;
224
225print $code;
226close STDOUT;
227