sha1-x86_64.pl revision bdfb8ad83da0647e9b9a32792598e8ce7ba3ef4d
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# well as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# The single command-line argument is forwarded unchanged to
# x86_64-xlate.pl (presumably the output file name -- confirm against the
# translator).
$output=shift;

# Locate the perlasm translator: first next to this script, then in
# ../../perlasm relative to it.  $dir is the directory part of $0 including
# its trailing separator, or the empty string when $0 has no directory part.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# From here on everything printed to STDOUT is piped through the translator.
# The interpreter path is quoted so that a path containing spaces survives
# the shell, and failing to start the pipe is fatal rather than silently
# producing no output.
open STDOUT,"| \"$^X\" $xlate $output" or die "can't call $xlate: $!";

# Registers in which the three arguments arrive.  These assignments are
# overwritten just below; they are kept as a record of the incoming argument
# registers, which PROLOGUE names literally.
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

# Scratch registers used by the round bodies.
$xi="%eax";	# current message-schedule word
$t0="%ebx";	# temporary for the round function
$t1="%ecx";	# second temporary (rounds 40..59 only)

# Registers holding the five working variables plus one spare.
$A="%edx";
$B="%esi";
$C="%edi";
$D="%ebp";
$E="%r11d";
$T="%r12d";

# Rotated by one position after every round by the driver loops, so each
# round body receives its (a,b,c,d,e,f) register assignment from this list.
@V=($A,$B,$C,$D,$E,$T);

# PROLOGUE($func) -- append the entry sequence of function $func to $code.
#
# Emits the .globl/.type/.align directives and the label, followed by code
# that:
#   * pushes %rbx, %rbp and %r12;
#   * copies the incoming arguments %rdi, %rsi, %rdx into $ctx, $inp, $num;
#   * reserves 8+16*4 bytes of stack, aligns %rsp down to a 64-byte boundary
#     and stores the pre-adjustment %rsp at offset 16*4, directly above the
#     16-word scratch area at 0..63(%rsp);
#   * loads the 32-bit words at offsets 0,4,8,12,16 of $ctx into $A..$E.
#
# Backtick-quoted expressions are left in the text as-is; they are evaluated
# by the substitution that runs over $code just before it is printed.
sub PROLOGUE {
my $func=shift;
$code.=<<___;
.globl	$func
.type	$func,\@function,3
.align	16
$func:
	push	%rbx
	push	%rbp
	push	%r12
	mov	%rsp,%rax
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
___
}

# EPILOGUE($func) -- append the exit sequence of function $func to $code.
#
# Reloads %rsp from offset 16*4 of the current stack pointer (the slot
# PROLOGUE stores the pre-adjustment %rsp in), pops %r12, %rbp and %rbx in
# the reverse of PROLOGUE's push order, returns, and emits the .size
# directive that closes the symbol.
sub EPILOGUE {
my $func=shift;
$code.=<<___;
	mov	`16*4`(%rsp),%rsp
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	$func,.-$func
___
}

# BODY_00_19($i,$a,$b,$c,$d,$e,$f[,$host]) -- append round $i (0..19) to $code.
#
#   $i        round number
#   $a..$e    registers holding the five working variables on entry
#   $f        register that receives the new value
#             0x5a827999 + X[i] + e + rol(a,5) + (((c ^ d) & b) ^ d)
#   $host     optional; when defined, the byte swap of each freshly loaded
#             input word is omitted (presumably for input that is already in
#             host byte order -- confirm with callers; none in this file
#             pass it)
#
# Side effects of the emitted code: $e is used as a temporary, $b is rotated
# left by 30, and $xi is left holding the schedule word for round $i+1.
#
# Rounds 0..14 take the next word straight from the input block at
# 4*($i+1)($inp) and save it in the 16-word scratch area on the stack.
# Rounds 15..19 compute it from that area instead:
#   X[j] = rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1)   (indices mod 16, j=$i+1)
# Round 0 additionally loads word 0 first.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
my $j=$i+1;
# Decide about the byte swap here, while $host is still in scope.  Deferring
# the test to the backtick evaluation pass does not work: by then $host has
# already been interpolated into the text, so the test no longer sees it.
my $bswap=defined($host)?"":"bswap	$xi";
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi
	$bswap
	mov	$xi,`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	lea	0x5a827999($xi,$e),$f
	mov	$c,$t0
	mov	`4*$j`($inp),$xi
	mov	$a,$e
	xor	$d,$t0
	$bswap
	rol	\$5,$e
	and	$b,$t0
	mov	$xi,`4*$j`(%rsp)
	add	$e,$f
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
___
$code.=<<___ if ($i>=15);
	lea	0x5a827999($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	and	$b,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
	mov	$xi,`4*($j%16)`(%rsp)
___
}

# BODY_20_39($i,$a,$b,$c,$d,$e,$f) -- append round $i to $code.  Used for
# rounds 20..39 and, with the other constant, for rounds 60..79.
#
#   $i        round number; selects K = 0x6ed9eba1 for $i<40, else 0xca62c1d6
#             (K is interpolated into the text as a decimal number)
#   $a..$e    registers holding the five working variables on entry
#   $f        register that receives K + X[i] + e + rol(a,5) + (b ^ c ^ d)
#
# Side effects of the emitted code: $e is used as a temporary and $b is
# rotated left by 30.
#
# For $i<79 the schedule word for the next round is computed into $xi as
#   rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1)   (indices mod 16, j=$i+1).
# It is written back to the scratch area only for $i<76: the words produced
# in rounds 76..78 would first be re-read by schedule steps beyond round 79,
# so storing them is skipped.  Round 79 has no next word and emits only the
# round arithmetic.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	lea	$K($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$b,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	xor	$d,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
___
$code.=<<___ if ($i<76);
	mov	$xi,`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	lea	$K($xi,$e),$f
	mov	$c,$t0
	mov	$a,$e
	xor	$b,$t0
	rol	\$5,$e
	xor	$d,$t0
	add	$e,$f
	rol	\$30,$b
	add	$t0,$f
___
}

# BODY_40_59($i,$a,$b,$c,$d,$e,$f) -- append round $i (40..59) to $code.
#
#   $i        round number (only used to derive the scratch-area offsets)
#   $a..$e    registers holding the five working variables on entry
#   $f        register that receives
#             0x8f1bbcdc + X[i] + e + rol(a,5) + ((b & c) | ((b | c) & d))
#
# Side effects of the emitted code: $e is used as a temporary, $b is rotated
# left by 30, both $t0 and $t1 are clobbered, and the schedule word for the
# next round,
#   rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1)   (indices mod 16, j=$i+1),
# is left in $xi and written back to slot j%16 of the scratch area.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	lea	0x8f1bbcdc($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$b,$t0
	mov	$b,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi
	mov	$a,$e
	and	$c,$t0
	xor	`4*(($j+8)%16)`(%rsp),$xi
	or	$c,$t1
	rol	\$5,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi
	and	$d,$t1
	add	$e,$f
	rol	\$1,$xi
	or	$t1,$t0
	rol	\$30,$b
	mov	$xi,`4*($j%16)`(%rsp)
	add	$t0,$f
___
}

# ------------------------------------------------------------------------
# Driver: assemble the text of sha1_block_data_order in $code, expand the
# backtick expressions and print the result to STDOUT.
# ------------------------------------------------------------------------
$code=".text\n";

PROLOGUE("sha1_block_data_order");
$code.=".align	4\n.Lloop:\n";
# Emit the 80 rounds fully unrolled.  After each round @V is rotated right by
# one position, so the register that just received the round result ($f)
# becomes $a of the next round and the old $e becomes the next spare.
for($i=0;$i<20;$i++)	{ BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# 80 rotations of the six-element @V leave it rotated by two places, so the
# working variables a..e now sit in $E,$T,$A,$B,$C.  The code below adds the
# five context words to them, stores the sums back, shuffles the values into
# $A..$E again for the next iteration, advances $inp by one 64-byte block
# and loops until $num reaches zero.
$code.=<<___;
	add	0($ctx),$E
	add	4($ctx),$T
	add	8($ctx),$A
	add	12($ctx),$B
	add	16($ctx),$C
	mov	$E,0($ctx)
	mov	$T,4($ctx)
	mov	$A,8($ctx)
	mov	$B,12($ctx)
	mov	$C,16($ctx)

	xchg	$E,$A	# mov	$E,$A
	xchg	$T,$B	# mov	$T,$B
	xchg	$E,$C	# mov	$A,$C
	xchg	$T,$D	# mov	$B,$D
			# mov	$C,$E
	lea	`16*4`($inp),$inp
	sub	\$1,$num
	jnz	.Lloop
___
EPILOGUE("sha1_block_data_order");
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

####################################################################

# Replace every `expr` in the generated text with the value of expr.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to the translator: close reports both buffered write
# errors and a non-zero exit status of the child, so it must be checked.
close STDOUT or die "error closing STDOUT: $!";
