# sha1-x86_64.pl revision e45f106cb6b47af1f21efe76e933bdea2f5dd1ca
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# The generated assembly is accumulated in $code and piped through
# x86_64-xlate.pl, which translates it to the flavour requested in
# $output (ELF, macosx, mingw64, nasm, ...).
$output=shift;

# Locate the xlate helper next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe our output through the translator; check the open — a silent
# failure here would produce an empty/garbage assembly file.
open STDOUT,"| $^X $xlate $output" or die "can't call $xlate: $!";

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

# Round state: $xi holds the current message word, $t0/$t1 are
# scratch, $A..$E are the five SHA-1 chaining variables and $T is the
# sixth register that lets the registers rotate instead of the data.
$xi="%eax";
$t0="%ebx";
$t1="%ecx";
$A="%edx";
$B="%esi";
$C="%edi";
$D="%ebp";
$E="%r11d";
$T="%r12d";

@V=($A,$B,$C,$D,$E,$T);

# Emit function header: save callee-saved registers, carve out an
# aligned 16-word scratch area for the message schedule, move the
# arguments to their reassigned registers and load the chaining state.
sub PROLOGUE {
my $func=shift;
$code.=<<___;
.globl	$func
.type	$func,\@function,3
.align	16
$func:
	push	%rbx
	push	%rbp
	push	%r12
	mov	%rsp,%rax
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)	# original %rsp is stashed above the schedule

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
___
}

# Emit function trailer: restore %rsp from its stashed copy, pop the
# callee-saved registers and return.
sub EPILOGUE {
my $func=shift;
$code.=<<___;
	mov	`16*4`(%rsp),%rsp
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	$func,.-$func
___
}

# Rounds 0-19: F = (b&c)|(~b&d), K = 0x5a827999.  Rounds 0-14 consume
# input words directly (byte-swapping unless $host says the input is
# already in host order); from round 15 on the schedule is expanded
# in-place in the 16-word ring buffer on the stack.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi
	`"bswap	$xi"	if(!defined($host))`
	mov	$xi,`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	lea	0x5a827999($xi,$e),$f
	mov	$c,$t0
	mov	`4*$j`($inp),$xi
	mov	$a,$e
	xor	$d,$t0
	`"bswap	$xi"	if(!defined($host))`
	rol	\$5,$e
	and	$b,$t0
	mov	$xi,`4*$j`(%rsp)
	add	$e,$f
	xor	$d,$t0	# F_00_19(b,c,d) via (((c^d)&b)^d)
	rol	\$30,$b
	add	$t0,$f
___
$code.=<<___ if ($i>=15);
	lea	0x5a827999($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	and	$b,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
	mov	$xi,`4*($j%16)`(%rsp)
___
}

# Rounds 20-39 and 60-79: F = b^c^d, K chosen by round number.  The
# last three schedule stores (i>=76) are skipped because those words
# are never read again; round 79 has no next word to fetch at all.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	lea	$K($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$b,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	xor	$d,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
___
$code.=<<___ if ($i<76);
	mov	$xi,`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	lea	$K($xi,$e),$f
	mov	$c,$t0
	mov	$a,$e
	xor	$b,$t0
	rol	\$5,$e
	xor	$d,$t0
	add	$e,$f
	rol	\$30,$b
	add	$t0,$f
___
}

# Rounds 40-59: F = (b&c)|(b&d)|(c&d), K = 0x8f1bbcdc, computed as
# (b&c)|((b|c)&d) using both scratch registers.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	lea	0x8f1bbcdc($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$b,$t0
	mov	$b,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi
	mov	$a,$e
	and	$c,$t0
	xor	`4*(($j+8)%16)`(%rsp),$xi
	or	$c,$t1
	rol	\$5,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi
	and	$d,$t1
	add	$e,$f
	rol	\$1,$xi
	or	$t1,$t0
	rol	\$30,$b
	mov	$xi,`4*($j%16)`(%rsp)
	add	$t0,$f
___
}

$code=".text\n";

# Fully unroll the 80 rounds; @V is rotated after each round so the
# register names, not the data, move through the a..e roles.
&PROLOGUE("sha1_block_data_order");
$code.=".align	4\n.Lloop:\n";
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# After 80 rotations @V is back where it started, but the roles of the
# registers have shifted; fold the round results into the context and
# use xchg to swap the registers back into their loop-entry roles.
$code.=<<___;
	add	0($ctx),$E
	add	4($ctx),$T
	add	8($ctx),$A
	add	12($ctx),$B
	add	16($ctx),$C
	mov	$E,0($ctx)
	mov	$T,4($ctx)
	mov	$A,8($ctx)
	mov	$B,12($ctx)
	mov	$C,16($ctx)

	xchg	$E,$A	# mov	$E,$A
	xchg	$T,$B	# mov	$T,$B
	xchg	$E,$C	# mov	$A,$C
	xchg	$T,$D	# mov	$B,$D
			# mov	$C,$E
	lea	`16*4`($inp),$inp
	sub	\$1,$num
	jnz	.Lloop
___
&EPILOGUE("sha1_block_data_order");
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

####################################################################

# Second pass: evaluate the `...` expressions embedded in the emitted
# assembly (constant arithmetic and conditional bswap) and print.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# The output is a pipe; an unchecked close would lose translator errors.
close STDOUT or die "error closing STDOUT: $!";