#!/usr/bin/env perl
# (sha1-x86_64.pl revision bdfb8ad83da0647e9b9a32792598e8ce7ba3ef4d)
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# Usage: perl sha1-x86_64.pl [output]
# The generated text is piped through x86_64-xlate.pl, which receives
# the (optional) output argument unchanged.
$output=shift;

# Locate the translator either next to this script or under
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed from here on goes to the translator. Fail loudly
# if the pipe cannot be set up instead of silently producing nothing.
open STDOUT,"| $^X $xlate $output" or die "can't call $xlate: $!";

# Registers the three arguments arrive in; PROLOGUE reads them by
# their literal names (%rdi, %rsi, %rdx).
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$xi="%eax";	# current message-schedule word
$t0="%ebx";	# scratch
$t1="%ecx";	# scratch (used by BODY_40_59 only)
$A="%edx";
$B="%esi";
$C="%edi";
$D="%ebp";
$E="%r11d";
$T="%r12d";

# Six registers rotate through the roles (a,b,c,d,e,f); see the driver
# loops below.
@V=($A,$B,$C,$D,$E,$T);

# PROLOGUE(func): append the function header for `func` to $code.
# Saves %rbx/%rbp/%r12, moves the arguments to $ctx/$inp/$num, carves
# out a 64-byte-aligned frame whose first 16*4 bytes hold the schedule
# words and whose next slot (16*4(%rsp)) holds the caller's %rsp, then
# loads five 32-bit state words from $ctx into $A..$E.
sub PROLOGUE {
my $func=shift;
$code.=<<___;
.globl	$func
.type	$func,\@function,3
.align	16
$func:
	push	%rbx
	push	%rbp
	push	%r12
	mov	%rsp,%rax
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
___
}

# EPILOGUE(func): append the function tail for `func` to $code.
# Restores %rsp from the slot written by PROLOGUE, pops the saved
# registers in reverse order and emits the .size directive.
sub EPILOGUE {
my $func=shift;
$code.=<<___;
	mov	`16*4`(%rsp),%rsp
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	$func,.-$func
___
}

# BODY_00_19(i, a,b,c,d,e,f [,host]): append round `i` (0..19) to $code.
# a..f are register names; f receives the new value, e is reused as a
# temporary and b is rotated by 30 in place. Rounds below 15 also fetch
# input word i+1 from $inp and store it in the stack schedule; rounds
# 15 and up compute the next schedule word from the stack instead.
# When `host` is defined the bswap of freshly loaded input words is
# omitted.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
my $j=$i+1;
# Decide here, while $host is in scope, whether to byte-swap. The test
# used to live inside the back-ticked template, where $host had already
# been interpolated away by the time the expression was evaluated.
my $bswap=defined($host)?"":"bswap\t$xi";
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi
	$bswap
	mov	$xi,`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	lea	0x5a827999($xi,$e),$f
	mov	$c,$t0
	mov	`4*$j`($inp),$xi
	mov	$a,$e
	xor	$d,$t0
	$bswap
	rol	\$5,$e
	and	$b,$t0
	mov	$xi,`4*$j`(%rsp)
	add	$e,$f
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
___
$code.=<<___ if ($i>=15);
	lea	0x5a827999($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	and	$b,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
	mov	$xi,`4*($j%16)`(%rsp)
___
}

# BODY_20_39(i, a,b,c,d,e,f): append round `i` to $code; the driver
# uses it for rounds 20..39 and again for 60..79. The additive constant
# is 0x6ed9eba1 for i<40 and 0xca62c1d6 otherwise. The store of the
# next schedule word is skipped from round 76 on, and round 79 does not
# compute a next schedule word at all.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	lea	$K($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$b,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	xor	$d,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
___
$code.=<<___ if ($i<76);
	mov	$xi,`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	lea	$K($xi,$e),$f
	mov	$c,$t0
	mov	$a,$e
	xor	$b,$t0
	rol	\$5,$e
	xor	$d,$t0
	add	$e,$f
	rol	\$30,$b
	add	$t0,$f
___
}

# BODY_40_59(i, a,b,c,d,e,f): append round `i` (40..59) to $code.
# Combines b, c and d as (b AND c) OR ((b OR c) AND d) using both
# scratch registers, and always computes and stores the next schedule
# word.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	lea	0x8f1bbcdc($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$b,$t0
	mov	$b,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi
	mov	$a,$e
	and	$c,$t0
	xor	`4*(($j+8)%16)`(%rsp),$xi
	or	$c,$t1
	rol	\$5,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi
	and	$d,$t1
	add	$e,$f
	rol	\$1,$xi
	or	$t1,$t0
	rol	\$30,$b
	mov	$xi,`4*($j%16)`(%rsp)
	add	$t0,$f
___
}

$code=".text\n";

PROLOGUE("sha1_block_data_order");
$code.=".align	4\n.Lloop:\n";
# Emit the 80 rounds fully unrolled. After each round @V is rotated
# right by one, so the register that just received the result (role f)
# becomes role a of the next round.
for($i=0;$i<20;$i++)	{ BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Add the previous state from $ctx into the working registers, write the
# sums back, shuffle them into $A..$E for the next iteration, advance
# $inp by 64 bytes and loop until $num reaches zero.
$code.=<<___;
	add	0($ctx),$E
	add	4($ctx),$T
	add	8($ctx),$A
	add	12($ctx),$B
	add	16($ctx),$C
	mov	$E,0($ctx)
	mov	$T,4($ctx)
	mov	$A,8($ctx)
	mov	$B,12($ctx)
	mov	$C,16($ctx)

	xchg	$E,$A	# mov	$E,$A
	xchg	$T,$B	# mov	$T,$B
	xchg	$E,$C	# mov	$A,$C
	xchg	$T,$D	# mov	$B,$D
			# mov	$C,$E
	lea	`16*4`($inp),$inp
	sub	\$1,$num
	jnz	.Lloop
___
EPILOGUE("sha1_block_data_order");
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

####################################################################

# Replace every back-ticked expression in the template with its value.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Closing a pipe waits for the child and reports its failure; without
# this check a broken translator run would leave truncated output and
# still look successful.
close STDOUT or die "error closing pipe to $xlate: ".($! ? $! : "exit status $?");