sha1-x86_64.pl revision f48372ded3bb76c2598392aa58abe6e2eb7432d2
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# sha1_block procedure for x86_64.
11#
12# It was brought to my attention that on EM64T compiler-generated code
13# was far behind 32-bit assembler implementation. This is unlike on
14# Opteron where compiler-generated code was only 15% behind 32-bit
15# assembler, which originally made it hard to motivate the effort.
16# There was suggestion to mechanically translate 32-bit code, but I
17# dismissed it, reasoning that x86_64 offers enough register bank
18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19# implementation:-) However! While 64-bit code does perform better
20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21# x86_64 does offer larger *addressable* bank, but out-of-order core
22# reaches for even more registers through dynamic aliasing, and EM64T
23# core must have managed to run-time optimize even 32-bit code just as
24# well as 64-bit one. Performance improvement is summarized in the
25# following table:
26#
27#		gcc 3.4		32-bit asm	cycles/byte
28# Opteron	+45%		+20%		6.8
29# Xeon P4	+65%		+0%		9.9
30# Core2		+60%		+10%		7.0
31
# Name of the output file; it is handed to the x86_64-xlate.pl filter below.
32$output=shift;
33
# Locate the perlasm translator either next to this script or in
# ../../perlasm (the usual OpenSSL tree layout).
34$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
35( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
36( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
37die "can't locate x86_64-xlate.pl";
38
# Pipe everything we print through the translator, which converts the
# "perlasm" dialect into the final assembler flavour and writes $output.
39open STDOUT,"| $^X $xlate $output";
40
# Registers carrying the three arguments on entry per the x86_64 ABI.
41$ctx="%rdi";	# 1st arg
42$inp="%rsi";	# 2nd arg
43$num="%rdx";	# 3rd arg
44
45# reassign arguments in order to produce more compact code
46$ctx="%r8";
47$inp="%r9";
48$num="%r10";
49
# Scratch registers used by the round bodies.
50$xi="%eax";	# current message-schedule word W[i]
51$t0="%ebx";	# temporary for the round's boolean function
52$t1="%ecx";	# second temporary (rounds 40..59 only)
# SHA-1 working state a..e plus one spare, $T, which receives each
# round's freshly computed value before the roles are rotated.
53$A="%edx";
54$B="%esi";
55$C="%edi";
56$D="%ebp";
57$E="%r11d";
58$T="%r12d";
59
# Register rotation list: rotated after every round so that the same
# code template can be reused with shifted register roles.
60@V=($A,$B,$C,$D,$E,$T);
61
# PROLOGUE($func) - emit the entry sequence for $func: save the
# callee-saved registers clobbered by the rounds (%rbx, %rbp, %r12),
# allocate a 64-byte-aligned frame for the 16-word message schedule,
# copy the three ABI argument registers into the reassigned ones, and
# load the five 32-bit chaining words from the context.
#
# Frame layout: W[0..15] occupies 16*4 bytes at (%rsp); the caller's
# pre-alignment stack pointer (staged in %rax) is kept at 16*4(%rsp)
# so that EPILOGUE can undo the "and -64" alignment.
62sub PROLOGUE {
63my $func=shift;
64$code.=<<___;
65.globl	$func
66.type	$func,\@function,3
67.align	16
68$func:
69	push	%rbx
70	push	%rbp
71	push	%r12
72	mov	%rsp,%rax
73	mov	%rdi,$ctx	# reassigned argument
74	sub	\$`8+16*4`,%rsp
75	mov	%rsi,$inp	# reassigned argument
76	and	\$-64,%rsp
77	mov	%rdx,$num	# reassigned argument
78	mov	%rax,`16*4`(%rsp)
79
80	mov	0($ctx),$A
81	mov	4($ctx),$B
82	mov	8($ctx),$C
83	mov	12($ctx),$D
84	mov	16($ctx),$E
85___
86}
87
# EPILOGUE($func) - emit the matching exit sequence: restore the
# caller's stack pointer saved by PROLOGUE at 16*4(%rsp), pop the
# callee-saved registers in reverse order and return.
88sub EPILOGUE {
89my $func=shift;
90$code.=<<___;
91	mov	`16*4`(%rsp),%rsp
92	pop	%r12
93	pop	%rbp
94	pop	%rbx
95	ret
96.size	$func,.-$func
97___
98}
99
# BODY_00_19($i,$a,$b,$c,$d,$e,$f[,$host]) - emit one of SHA-1 rounds
# 0..19: f = rol(a,5) + Ch(b,c,d) + e + W[i] + 0x5a827999, b = rol(b,30).
# Ch(b,c,d) = (b&c)|(~b&d) is computed in $t0 as ((c^d)&b)^d.  The new
# value lands in $f; the caller rotates the register list afterwards.
# When $host is defined the bswap is suppressed, i.e. input is taken as
# already host-endian -- presumably for a host-order entry point;
# confirm against callers (none visible in this file).
100sub BODY_00_19 {
101my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
102my $j=$i+1;
# Round 0 only: fetch, byte-swap and stash the first input word up front.
103$code.=<<___ if ($i==0);
104	mov	`4*$i`($inp),$xi
105	`"bswap	$xi"	if(!defined($host))`
106	mov	$xi,`4*$i`(%rsp)
107___
# Rounds 0..14: W[i] comes straight from the input block; the load,
# bswap and store of the NEXT word W[j] are interleaved with the round
# arithmetic to hide latency.
108$code.=<<___ if ($i<15);
109	lea	0x5a827999($xi,$e),$f
110	mov	$c,$t0
111	mov	`4*$j`($inp),$xi
112	mov	$a,$e
113	xor	$d,$t0
114	`"bswap	$xi"	if(!defined($host))`
115	rol	\$5,$e
116	and	$b,$t0
117	mov	$xi,`4*$j`(%rsp)
118	add	$e,$f
119	xor	$d,$t0
120	rol	\$30,$b
121	add	$t0,$f
122___
# Rounds 15..19: start expanding the schedule for the next round,
# W[j] = rol(W[j-3]^W[j-8]^W[j-14]^W[j-16],1), kept in a 16-word ring
# buffer on the stack -- hence the %16 index arithmetic below.
123$code.=<<___ if ($i>=15);
124	lea	0x5a827999($xi,$e),$f
125	mov	`4*($j%16)`(%rsp),$xi
126	mov	$c,$t0
127	mov	$a,$e
128	xor	`4*(($j+2)%16)`(%rsp),$xi
129	xor	$d,$t0
130	rol	\$5,$e
131	xor	`4*(($j+8)%16)`(%rsp),$xi
132	and	$b,$t0
133	add	$e,$f
134	xor	`4*(($j+13)%16)`(%rsp),$xi
135	xor	$d,$t0
136	rol	\$30,$b
137	add	$t0,$f
138	rol	\$1,$xi
139	mov	$xi,`4*($j%16)`(%rsp)
140___
141}
142
# BODY_20_39($i,$a,$b,$c,$d,$e,$f) - emit one of SHA-1 rounds 20..39 or
# 60..79 (the two "parity" quarters share this template):
# f = rol(a,5) + (b^c^d) + e + K + W[i], b = rol(b,30), where
# K = 0x6ed9eba1 for rounds 20..39 and 0xca62c1d6 for rounds 60..79.
143sub BODY_20_39 {
144my ($i,$a,$b,$c,$d,$e,$f)=@_;
145my $j=$i+1;
146my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
# All but the very last round also expand W[j] in the on-stack ring buffer.
147$code.=<<___ if ($i<79);
148	lea	$K($xi,$e),$f
149	mov	`4*($j%16)`(%rsp),$xi
150	mov	$c,$t0
151	mov	$a,$e
152	xor	`4*(($j+2)%16)`(%rsp),$xi
153	xor	$b,$t0
154	rol	\$5,$e
155	xor	`4*(($j+8)%16)`(%rsp),$xi
156	xor	$d,$t0
157	add	$e,$f
158	xor	`4*(($j+13)%16)`(%rsp),$xi
159	rol	\$30,$b
160	add	$t0,$f
161	rol	\$1,$xi
162___
# Rounds 76..78 skip the store: those W[j] slots are never read again.
163$code.=<<___ if ($i<76);
164	mov	$xi,`4*($j%16)`(%rsp)
165___
# Round 79: no message word left to schedule, just the round function.
166$code.=<<___ if ($i==79);
167	lea	$K($xi,$e),$f
168	mov	$c,$t0
169	mov	$a,$e
170	xor	$b,$t0
171	rol	\$5,$e
172	xor	$d,$t0
173	add	$e,$f
174	rol	\$30,$b
175	add	$t0,$f
176___
177}
178
# BODY_40_59($i,$a,$b,$c,$d,$e,$f) - emit one of SHA-1 rounds 40..59:
# f = rol(a,5) + Maj(b,c,d) + e + W[i] + 0x8f1bbcdc, b = rol(b,30).
# Maj(b,c,d) = (b&c)|(b&d)|(c&d) is computed as (b&c)|((b|c)&d), which
# needs the extra temporary $t1; the W[j] ring-buffer expansion is
# interleaved with the round arithmetic as in the other quarters.
179sub BODY_40_59 {
180my ($i,$a,$b,$c,$d,$e,$f)=@_;
181my $j=$i+1;
182$code.=<<___;
183	lea	0x8f1bbcdc($xi,$e),$f
184	mov	`4*($j%16)`(%rsp),$xi
185	mov	$b,$t0
186	mov	$b,$t1
187	xor	`4*(($j+2)%16)`(%rsp),$xi
188	mov	$a,$e
189	and	$c,$t0
190	xor	`4*(($j+8)%16)`(%rsp),$xi
191	or	$c,$t1
192	rol	\$5,$e
193	xor	`4*(($j+13)%16)`(%rsp),$xi
194	and	$d,$t1
195	add	$e,$f
196	rol	\$1,$xi
197	or	$t1,$t0
198	rol	\$30,$b
199	mov	$xi,`4*($j%16)`(%rsp)
200	add	$t0,$f
201___
202}
203
204$code=".text\n";
205
206&PROLOGUE("sha1_block_data_order");
207$code.=".align	4\n.Lloop:\n";
# Fully unroll the 80 rounds.  After each round the register list @V is
# rotated right (pop/unshift) so that $f -- the freshly computed value --
# becomes the next round's "a" and the other roles shift down by one.
208for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
209for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
210for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
211for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# After 80 rotations of the 6-element list (80 mod 6 == 2) the state
# a..e lives in $E,$T,$A,$B,$C -- hence the order of the adds/moves
# below.  The xchg sequence then shuffles a..e back into $A..$E for the
# next block, doing the five commented movs in only four instructions.
212$code.=<<___;
213	add	0($ctx),$E
214	add	4($ctx),$T
215	add	8($ctx),$A
216	add	12($ctx),$B
217	add	16($ctx),$C
218	mov	$E,0($ctx)
219	mov	$T,4($ctx)
220	mov	$A,8($ctx)
221	mov	$B,12($ctx)
222	mov	$C,16($ctx)

224	xchg	$E,$A	# mov	$E,$A
225	xchg	$T,$B	# mov	$T,$B
226	xchg	$E,$C	# mov	$A,$C
227	xchg	$T,$D	# mov	$B,$D
228			# mov	$C,$E
229	lea	`16*4`($inp),$inp
230	sub	\$1,$num
231	jnz	.Lloop
232___
233&EPILOGUE("sha1_block_data_order");
234$code.=<<___;
235.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
236___
237
238####################################################################

# Evaluate every backtick-quoted span in the accumulated code (constant
# expressions such as the 4*($j%16) displacements) and emit the result
# through the xlate pipe opened on STDOUT above.
240$code =~ s/\`([^\`]*)\`/eval $1/gem;
241print $code;
242close STDOUT;
243