#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
# from one benchmark and µ-arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
# PIII          16%-30%
# P4            12%-12%
# Opteron       18%-40%
# Core2         19%-44%
# Atom          38%-64%
# Westmere      53%-121%(PCLMULQDQ)/20%-32%(MMX)
# Sandy Bridge  72%-127%(PCLMULQDQ)/27%-23%(MMX)
#
# Note that above improvement coefficients are not coefficients for
# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
# is more and more dominated by other subroutines, most notably by
# BN_GF2m_mod[_mul]_arr...

# --------------------------------------------------------------------
# Build-time setup.
#
# Make x86asm.pl reachable both from this script's own directory and
# from ../../perlasm relative to it, then initialise the generator.
# The last command-line argument being the literal "386" selects the
# integer-only build ($x86only): no MMX helper and no run-time CPU
# capability dispatch is emitted in that case.
# --------------------------------------------------------------------
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");

# The PCLMULQDQ path is emitted only when -DOPENSSL_IA32_SSE2 appears
# anywhere on the command line.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# NOTE(review): bn_GF2m_mul_2x2 below references OPENSSL_ia32cap_P
# whenever !$x86only, yet the symbol is registered here only when
# $sse2 is set -- confirm this asymmetry is intended.
&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Register assignment shared by both 1x1 helpers: the two 32-bit
# polynomial operands arrive in $a and $b; $a1/$a2/$a4 are scratch
# registers used while building the lookup table.
$a="eax";
$b="ebx";
($a1,$a2,$a4)=("ecx","edx","ebp");

# MMX helper registers: $R accumulates the 64-bit product, @T are
# shift temporaries (note that $A aliases @T[1]), $B30/$B31 hold the
# correction terms for the two topmost bits of $a, and @i are the
# table-index registers.
$R="mm0";
@T=("mm1","mm2");
($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
@i=("esi","edi");

# --------------------------------------------------------------------
# _mul_1x1_mmx: carry-less 32x32->64 bit multiplication, MMX flavour.
#
#   in:   $a (eax), $b (ebx)
#   out:  $R (mm0) = 64-bit product
#   uses: ecx, edx, ebp, esi, edi, mm1-mm5 and 32+4 bytes of stack;
#         $b is consumed (shifted right as it is scanned).
#
# An 8-entry table of XOR combinations of a1=a&0x3fffffff,
# a2=(a<<1)&0x7fffffff and a4=a<<2 is built on the stack and then
# indexed with successive 3-bit windows of $b (11 windows in total:
# one before the loop, eight in the loop, two after it). Bits 31 and
# 30 of $a, which are masked out of the table entries, are handled
# separately: each is broadcast into a mask with pcmpgtd, ANDed with
# $b and shifted into place ($B31, $B30).
#
# The instruction order below is the emitted schedule; do not reorder.
# --------------------------------------------------------------------
if (!$x86only) {
&function_begin_B("_mul_1x1_mmx");
    &sub    ("esp",32+4);
    &mov    ($a1,$a);
    &lea    ($a2,&DWP(0,$a,$a));
    &and    ($a1,0x3fffffff);
    &lea    ($a4,&DWP(0,$a2,$a2));          # taken before $a2 is masked
    &mov    (&DWP(0*4,"esp"),0);
    &and    ($a2,0x7fffffff);
    &movd   ($A,$a);
    &movd   ($B,$b);
    &mov    (&DWP(1*4,"esp"),$a1);          # a1
    &xor    ($a1,$a2);                      # a1^a2
    &pxor   ($B31,$B31);
    &pxor   ($B30,$B30);
    &mov    (&DWP(2*4,"esp"),$a2);          # a2
    &xor    ($a2,$a4);                      # a2^a4
    &mov    (&DWP(3*4,"esp"),$a1);          # a1^a2
    &pcmpgtd($B31,$A);                      # broadcast 31st bit
    &paddd  ($A,$A);                        # $A<<=1
    &xor    ($a1,$a2);                      # a1^a4=a1^a2^a2^a4
    &mov    (&DWP(4*4,"esp"),$a4);          # a4
    &xor    ($a4,$a2);                      # a2=a4^a2^a4
    &pand   ($B31,$B);
    &pcmpgtd($B30,$A);                      # broadcast 30th bit
    &mov    (&DWP(5*4,"esp"),$a1);          # a1^a4
    &xor    ($a4,$a1);                      # a1^a2^a4
    &psllq  ($B31,31);
    &pand   ($B30,$B);
    &mov    (&DWP(6*4,"esp"),$a2);          # a2^a4
    &mov    (@i[0],0x7);
    &mov    (&DWP(7*4,"esp"),$a4);          # a1^a2^a4
    &mov    ($a4,@i[0]);                    # $a4 now holds the 0x7 window mask
    &and    (@i[0],$b);
    &shr    ($b,3);
    &mov    (@i[1],$a4);
    &psllq  ($B30,30);
    &and    (@i[1],$b);
    &shr    ($b,3);
    &movd   ($R,&DWP(0,"esp",@i[0],4));
    &mov    (@i[0],$a4);
    &and    (@i[0],$b);
    &shr    ($b,3);
    for($n=1;$n<9;$n++) {
        &movd   (@T[1],&DWP(0,"esp",@i[1],4));
        &mov    (@i[1],$a4);
        &psllq  (@T[1],3*$n);
        &and    (@i[1],$b);
        &shr    ($b,3);
        &pxor   ($R,@T[1]);

        # rotate register names so consecutive iterations alternate
        # between the two index and the two temporary registers
        push(@i,shift(@i)); push(@T,shift(@T));
    }
    &movd   (@T[1],&DWP(0,"esp",@i[1],4));
    &pxor   ($R,$B30);
    &psllq  (@T[1],3*$n++);
    &pxor   ($R,@T[1]);

    &movd   (@T[0],&DWP(0,"esp",@i[0],4));
    &pxor   ($R,$B31);
    &psllq  (@T[0],3*$n);
    &add    ("esp",32+4);
    &pxor   ($R,@T[0]);
    &ret    ();
&function_end_B("_mul_1x1_mmx");
}

# Integer helper registers: the 64-bit product is returned as
# $hi:$lo; @T is re-bound to general-purpose temporaries.
($lo,$hi)=("eax","edx");
@T=("ecx","ebp");

# --------------------------------------------------------------------
# _mul_1x1_ialu: carry-less 32x32->64 bit multiplication, integer-only
# flavour.
#
#   in:   $a (eax), $b (ebx)
#   out:  $lo (eax) = low 32 bits, $hi (edx) = high 32 bits
#   uses: ecx, ebp, esi, edi and 32+4 bytes of stack; $b is consumed.
#
# Same table-driven scheme as the MMX helper: an 8-entry table on the
# stack indexed by 3-bit windows of $b. Every table word is split into
# a part shifted left into $lo and a part shifted right into $hi. Bits
# 31 and 30 of $a are broadcast with sar, ANDed with $b and folded in
# separately.
#
# The instruction order below is the emitted schedule; do not reorder.
# --------------------------------------------------------------------
&function_begin_B("_mul_1x1_ialu");
    &sub    ("esp",32+4);
    &mov    ($a1,$a);
    &lea    ($a2,&DWP(0,$a,$a));
    &lea    ($a4,&DWP(0,"",$a,4));
    &and    ($a1,0x3fffffff);
    &lea    (@i[1],&DWP(0,$lo,$lo));
    &sar    ($lo,31);                       # broadcast 31st bit
    &mov    (&DWP(0*4,"esp"),0);
    &and    ($a2,0x7fffffff);
    &mov    (&DWP(1*4,"esp"),$a1);          # a1
    &xor    ($a1,$a2);                      # a1^a2
    &mov    (&DWP(2*4,"esp"),$a2);          # a2
    &xor    ($a2,$a4);                      # a2^a4
    &mov    (&DWP(3*4,"esp"),$a1);          # a1^a2
    &xor    ($a1,$a2);                      # a1^a4=a1^a2^a2^a4
    &mov    (&DWP(4*4,"esp"),$a4);          # a4
    &xor    ($a4,$a2);                      # a2=a4^a2^a4
    &mov    (&DWP(5*4,"esp"),$a1);          # a1^a4
    &xor    ($a4,$a1);                      # a1^a2^a4
    &sar    (@i[1],31);                     # broadcast 30th bit
    &and    ($lo,$b);
    &mov    (&DWP(6*4,"esp"),$a2);          # a2^a4
    &and    (@i[1],$b);
    &mov    (&DWP(7*4,"esp"),$a4);          # a1^a2^a4
    &mov    ($hi,$lo);
    &shl    ($lo,31);
    &mov    (@T[0],@i[1]);
    &shr    ($hi,1);

    &mov    (@i[0],0x7);
    &shl    (@i[1],30);
    &and    (@i[0],$b);
    &shr    (@T[0],2);
    &xor    ($lo,@i[1]);

    &shr    ($b,3);
    &mov    (@i[1],0x7);                    # 5-byte instruction!?
    &and    (@i[1],$b);
    &shr    ($b,3);
    &xor    ($hi,@T[0]);
    &xor    ($lo,&DWP(0,"esp",@i[0],4));
    &mov    (@i[0],0x7);
    &and    (@i[0],$b);
    &shr    ($b,3);
    for($n=1;$n<9;$n++) {
        &mov    (@T[1],&DWP(0,"esp",@i[1],4));
        &mov    (@i[1],0x7);
        &mov    (@T[0],@T[1]);
        &shl    (@T[1],3*$n);
        &and    (@i[1],$b);
        &shr    (@T[0],32-3*$n);
        &xor    ($lo,@T[1]);
        &shr    ($b,3);
        &xor    ($hi,@T[0]);

        # rotate register names so consecutive iterations alternate
        # between the two index and the two temporary registers
        push(@i,shift(@i)); push(@T,shift(@T));
    }
    &mov    (@T[1],&DWP(0,"esp",@i[1],4));
    &mov    (@T[0],@T[1]);
    &shl    (@T[1],3*$n);
    &mov    (@i[1],&DWP(0,"esp",@i[0],4));
    &shr    (@T[0],32-3*$n);        $n++;
    &mov    (@i[0],@i[1]);
    &xor    ($lo,@T[1]);
    &shl    (@i[1],3*$n);
    &xor    ($hi,@T[0]);
    &shr    (@i[0],32-3*$n);
    &xor    ($lo,@i[1]);
    &xor    ($hi,@i[0]);

    &add    ("esp",32+4);
    &ret    ();
&function_end_B("_mul_1x1_ialu");

# --------------------------------------------------------------------
# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
#
# Multiplies the two-word polynomials a1:a0 and b1:b0 and stores the
# four-word result at r. Unless this is a "386" build, the routine
# dispatches at run time on OPENSSL_ia32cap_P:
#
#   - MMX bit clear                      -> integer path ("ialu")
#   - FXSR and PCLMULQDQ bits both set   -> single pclmulqdq
#     (emitted only when $sse2)
#   - otherwise                          -> MMX path ("mmx")
#
# The MMX and integer paths both perform three 1x1 multiplications
# (a1*b1, a0*b0, (a0+a1)*(b0+b1)) and XOR the partial products
# together.
# --------------------------------------------------------------------
&function_begin_B("bn_GF2m_mul_2x2");
if (!$x86only) {
    &picmeup("edx","OPENSSL_ia32cap_P");
    &mov    ("eax",&DWP(0,"edx"));
    &mov    ("edx",&DWP(4,"edx"));
    &test   ("eax",1<<23);                  # check MMX bit
    &jz     (&label("ialu"));
if ($sse2) {
    &test   ("eax",1<<24);                  # check FXSR bit
    &jz     (&label("mmx"));
    &test   ("edx",1<<1);                   # check PCLMULQDQ bit
    &jz     (&label("mmx"));

    # Nothing has been pushed yet, so the arguments are addressed
    # directly off esp: r at 4(%esp), the four operand words from
    # 8(%esp). shufps swaps the dwords within each 64-bit half before
    # the two halves are multiplied.
    &movups     ("xmm0",&QWP(8,"esp"));
    &shufps     ("xmm0","xmm0",0b10110001);
    &pclmulqdq  ("xmm0","xmm0",1);
    &mov        ("eax",&DWP(4,"esp"));
    &movups     (&QWP(0,"eax"),"xmm0");
    &ret    ();

&set_label("mmx",16);
}
    &push   ("ebp");
    &push   ("ebx");
    &push   ("esi");
    &push   ("edi");
    &mov    ($a,&wparam(1));
    &mov    ($b,&wparam(3));
    &call   ("_mul_1x1_mmx");               # a1·b1
    &movq   ("mm7",$R);

    &mov    ($a,&wparam(2));
    &mov    ($b,&wparam(4));
    &call   ("_mul_1x1_mmx");               # a0·b0
    &movq   ("mm6",$R);

    &mov    ($a,&wparam(1));
    &mov    ($b,&wparam(3));
    &xor    ($a,&wparam(2));
    &xor    ($b,&wparam(4));
    &call   ("_mul_1x1_mmx");               # (a0+a1)·(b0+b1)
    &pxor   ($R,"mm7");
    &mov    ($a,&wparam(0));                # $a = r
    &pxor   ($R,"mm6");                     # (a0+a1)·(b0+b1)-a1·b1-a0·b0

    # Split the middle term across the low and high result halves;
    # register restores are interleaved with the MMX work.
    &movq   ($A,$R);
    &psllq  ($R,32);
    &pop    ("edi");
    &psrlq  ($A,32);
    &pop    ("esi");
    &pxor   ($R,"mm6");
    &pop    ("ebx");
    &pxor   ($A,"mm7");
    &movq   (&QWP(0,$a),$R);
    &pop    ("ebp");
    &movq   (&QWP(8,$a),$A);
    &emms   ();
    &ret    ();
&set_label("ialu",16);
}
    &push   ("ebp");
    &push   ("ebx");
    &push   ("esi");
    &push   ("edi");
    &stack_push(4+1);                       # scratch for a0·b0 at 0..4(%esp), a1·b1 at 8..12(%esp)

    &mov    ($a,&wparam(1));
    &mov    ($b,&wparam(3));
    &call   ("_mul_1x1_ialu");              # a1·b1
    &mov    (&DWP(8,"esp"),$lo);
    &mov    (&DWP(12,"esp"),$hi);

    &mov    ($a,&wparam(2));
    &mov    ($b,&wparam(4));
    &call   ("_mul_1x1_ialu");              # a0·b0
    &mov    (&DWP(0,"esp"),$lo);
    &mov    (&DWP(4,"esp"),$hi);

    &mov    ($a,&wparam(1));
    &mov    ($b,&wparam(3));
    &xor    ($a,&wparam(2));
    &xor    ($b,&wparam(4));
    &call   ("_mul_1x1_ialu");              # (a0+a1)·(b0+b1)

    &mov    ("ebp",&wparam(0));             # ebp = r
        @r=("ebx","ecx","edi","esi");
    &mov    (@r[0],&DWP(0,"esp"));
    &mov    (@r[1],&DWP(4,"esp"));
    &mov    (@r[2],&DWP(8,"esp"));
    &mov    (@r[3],&DWP(12,"esp"));

    # r[0] and r[3] are stored as-is; the two middle words are XOR
    # combinations of all partial products. Register restores are
    # interleaved with the arithmetic.
    &xor    ($lo,$hi);
    &xor    ($hi,@r[1]);
    &xor    ($lo,@r[0]);
    &mov    (&DWP(0,"ebp"),@r[0]);
    &xor    ($hi,@r[2]);
    &mov    (&DWP(12,"ebp"),@r[3]);
    &xor    ($lo,@r[3]);
    &stack_pop(4+1);
    &xor    ($hi,@r[3]);
    &pop    ("edi");
    &xor    ($lo,$hi);
    &pop    ("esi");
    &mov    (&DWP(8,"ebp"),$hi);
    &pop    ("ebx");
    &mov    (&DWP(4,"ebp"),$lo);
    &pop    ("ebp");
    &ret    ();
&function_end_B("bn_GF2m_mul_2x2");

&asciz  ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();