#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
# from one benchmark and µ-arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
# PIII		16%-30%
# P4		12%-12%
# Opteron	18%-40%
# Core2		19%-44%
# Atom		38%-64%
# Westmere	53%-121%(PCLMULQDQ)/20%-32%(MMX)
# Sandy Bridge	72%-127%(PCLMULQDQ)/27%-23%(MMX)
#
# Note that above improvement coefficients are not coefficients for
# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
# is more and more dominated by other subroutines, most notably by
# BN_GF2m_mod[_mul]_arr...

# Locate the perlasm support code relative to this script and load it.
# $dir is the directory part of $0 including the trailing separator; when
# $0 carries no path separator the match fails and $dir stays undefined,
# so the search paths degrade to "" and "../../perlasm".
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# A trailing "386" argument selects the integer-only build: with $x86only
# set, neither _mul_1x1_mmx nor the MMX/PCLMULQDQ dispatch is generated.
&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");

# -DOPENSSL_IA32_SSE2 anywhere on the command line enables generation of
# the PCLMULQDQ code path in bn_GF2m_mul_2x2.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Integer register aliases shared by both _mul_1x1_* helpers:
# $a/$b are the 32-bit input operands, $a1/$a2/$a4 are scratch registers
# used while building the 8-entry lookup table.
$a="eax";
$b="ebx";
($a1,$a2,$a4)=("ecx","edx","ebp");

# MMX register aliases for _mul_1x1_mmx: $R accumulates the 64-bit result,
# @T are temporaries, $B30/$B31 hold the contributions of the two top bits
# of $a. Note that $A and $T[1] both name mm2.
# @i are the table-index registers; _mul_1x1_ialu uses them as well.
$R="mm0";
@T=("mm1","mm2");
($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
@i=("esi","edi");

# _mul_1x1_mmx: 32x32->64-bit carry-less (GF(2)[x]) multiplication, MMX flavour.
# Generated only when the build is not restricted to plain 386 code.
#
#   in:   $a (eax), $b (ebx)  - the two 32-bit polynomials
#   out:  $R (mm0)            - the 64-bit product
#   uses: 32+4 bytes of stack for an 8-entry dword table, and overwrites
#         ebx, ecx, edx, ebp, esi, edi and mm1-mm5.
#
# Method: the low 30 bits of $a are multiplied by every 3-bit polynomial
# and stored as tab[0..7] = 0, a1, a2, a1^a2, a4, a1^a4, a2^a4, a1^a2^a4,
# where a1 = a & 0x3fffffff, a2 = (a<<1) & 0x7fffffff and a4 = a<<2.
# $b is then consumed three bits at a time (11 lookups, shifts 0,3,...,30)
# and the shifted table entries are xor-ed into $R.  Bits 30 and 31 of $a,
# which the table drops, are handled separately: each is broadcast to a
# mask, and-ed with $b and shifted left by 30 resp. 31 ($B30/$B31).
# The instruction order below is the emitted schedule; integer and MMX
# instructions are deliberately interleaved.
					if (!$x86only) {
&function_begin_B("_mul_1x1_mmx");
	&sub	("esp",32+4);
	 &mov	($a1,$a);
	 &lea	($a2,&DWP(0,$a,$a));
	 &and	($a1,0x3fffffff);
	 &lea	($a4,&DWP(0,$a2,$a2));
	 &mov	(&DWP(0*4,"esp"),0);
	 &and	($a2,0x7fffffff);
	&movd	($A,$a);
	&movd	($B,$b);
	 &mov	(&DWP(1*4,"esp"),$a1);	# a1
	 &xor	($a1,$a2);		# a1^a2
	&pxor	($B31,$B31);
	&pxor	($B30,$B30);
	 &mov	(&DWP(2*4,"esp"),$a2);	# a2
	 &xor	($a2,$a4);		# a2^a4
	 &mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
	&pcmpgtd($B31,$A);		# broadcast 31st bit
	&paddd	($A,$A);		# $A<<=1
	 &xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
	 &mov	(&DWP(4*4,"esp"),$a4);	# a4
	 &xor	($a4,$a2);		# a2=a4^a2^a4
	&pand	($B31,$B);
	&pcmpgtd($B30,$A);		# broadcast 30th bit
	 &mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
	 &xor	($a4,$a1);		# a1^a2^a4
	&psllq	($B31,31);
	&pand	($B30,$B);
	 &mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
	&mov	(@i[0],0x7);
	 &mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
	 &mov	($a4,@i[0]);		# $a4 now holds the 3-bit index mask
	&and	(@i[0],$b);
	&shr	($b,3);
	&mov	(@i[1],$a4);
	&psllq	($B30,30);
	&and	(@i[1],$b);
	&shr	($b,3);
	&movd	($R,&DWP(0,"esp",@i[0],4));	# lookup #0, no shift
	&mov	(@i[0],$a4);
	&and	(@i[0],$b);
	&shr	($b,3);
	# Lookups #1..#8.  @i and @T are rotated after every step so that
	# consecutive lookups alternate between the two index/temporary
	# registers; an even number of rotations restores their order.
	for($n=1;$n<9;$n++) {
		&movd	(@T[1],&DWP(0,"esp",@i[1],4));
		&mov	(@i[1],$a4);
		&psllq	(@T[1],3*$n);
		&and	(@i[1],$b);
		&shr	($b,3);
		&pxor	($R,@T[1]);

		push(@i,shift(@i)); push(@T,shift(@T));
	}
	# Lookups #9 and #10 (shifts 27 and 30), merged with the top-bit terms.
	&movd	(@T[1],&DWP(0,"esp",@i[1],4));
	&pxor	($R,$B30);
	&psllq	(@T[1],3*$n++);
	&pxor	($R,@T[1]);

	&movd	(@T[0],&DWP(0,"esp",@i[0],4));
	&pxor	($R,$B31);
	&psllq	(@T[0],3*$n);
	&add	("esp",32+4);
	&pxor	($R,@T[0]);
	&ret	();
&function_end_B("_mul_1x1_mmx");
					}

# _mul_1x1_ialu: 32x32->64-bit carry-less (GF(2)[x]) multiplication using
# integer instructions only.
#
#   in:   $a (eax), $b (ebx)      - the two 32-bit polynomials
#   out:  $lo (eax), $hi (edx)    - low and high halves of the product
#   uses: 32+4 bytes of stack for an 8-entry dword table, and overwrites
#         ebx, ecx, ebp, esi, edi.
#
# $lo/$hi alias the same registers as $a/$a2 and @T aliases $a1/$a4; the
# result registers and temporaries are only put to use once the table
# values held in those registers have been stored.
#
# Method: tab[0..7] = 0, a1, a2, a1^a2, a4, a1^a4, a2^a4, a1^a2^a4 with
# a1 = a & 0x3fffffff, a2 = (a<<1) & 0x7fffffff, a4 = a<<2, i.e. the low
# 30 bits of $a times every 3-bit polynomial.  $b is consumed three bits
# at a time (11 lookups, shifts 0,3,...,30); each entry is split into the
# part that lands in $lo (shl) and the part that spills into $hi (shr).
# Bits 31 and 30 of $a are broadcast with sar, and-ed with $b and added
# as b<<31 and b<<30 respectively.
($lo,$hi)=("eax","edx");
@T=("ecx","ebp");

&function_begin_B("_mul_1x1_ialu");
	&sub	("esp",32+4);
	 &mov	($a1,$a);
	 &lea	($a2,&DWP(0,$a,$a));
	 &lea	($a4,&DWP(0,"",$a,4));
	 &and	($a1,0x3fffffff);
	&lea	(@i[1],&DWP(0,$lo,$lo));
	&sar	($lo,31);		# broadcast 31st bit
	 &mov	(&DWP(0*4,"esp"),0);
	 &and	($a2,0x7fffffff);
	 &mov	(&DWP(1*4,"esp"),$a1);	# a1
	 &xor	($a1,$a2);		# a1^a2
	 &mov	(&DWP(2*4,"esp"),$a2);	# a2
	 &xor	($a2,$a4);		# a2^a4
	 &mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
	 &xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
	 &mov	(&DWP(4*4,"esp"),$a4);	# a4
	 &xor	($a4,$a2);		# a2=a4^a2^a4
	 &mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
	 &xor	($a4,$a1);		# a1^a2^a4
	&sar	(@i[1],31);		# broadcast 30th bit
	&and	($lo,$b);
	 &mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
	&and	(@i[1],$b);
	 &mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
	&mov	($hi,$lo);		# $hi:$lo = (bit31 ? b : 0) << 31
	&shl	($lo,31);
	&mov	(@T[0],@i[1]);
	&shr	($hi,1);

	 &mov	(@i[0],0x7);
	&shl	(@i[1],30);		# @T[0]:@i[1] = (bit30 ? b : 0) << 30
	 &and	(@i[0],$b);
	&shr	(@T[0],2);
	&xor	($lo,@i[1]);

	&shr	($b,3);
	&mov	(@i[1],0x7);		# 5-byte instruction!?
	&and	(@i[1],$b);
	&shr	($b,3);
	 &xor	($hi,@T[0]);
	&xor	($lo,&DWP(0,"esp",@i[0],4));	# lookup #0, no shift
	&mov	(@i[0],0x7);
	&and	(@i[0],$b);
	&shr	($b,3);
	# Lookups #1..#8.  @i and @T are rotated after every step so that
	# consecutive lookups alternate between the two index/temporary
	# registers; an even number of rotations restores their order.
	for($n=1;$n<9;$n++) {
		&mov	(@T[1],&DWP(0,"esp",@i[1],4));
		&mov	(@i[1],0x7);
		&mov	(@T[0],@T[1]);
		&shl	(@T[1],3*$n);
		&and	(@i[1],$b);
		&shr	(@T[0],32-3*$n);
		&xor	($lo,@T[1]);
		&shr	($b,3);
		&xor	($hi,@T[0]);

		push(@i,shift(@i)); push(@T,shift(@T));
	}
	# Lookups #9 and #10 (shifts 27 and 30); the index registers are
	# recycled as temporaries for the very last one.
	&mov	(@T[1],&DWP(0,"esp",@i[1],4));
	&mov	(@T[0],@T[1]);
	&shl	(@T[1],3*$n);
	&mov	(@i[1],&DWP(0,"esp",@i[0],4));
	&shr	(@T[0],32-3*$n);	$n++;
	&mov	(@i[0],@i[1]);
	&xor	($lo,@T[1]);
	&shl	(@i[1],3*$n);
	&xor	($hi,@T[0]);
	&shr	(@i[0],32-3*$n);
	&xor	($lo,@i[1]);
	&xor	($hi,@i[0]);

	&add	("esp",32+4);
	&ret	();
&function_end_B("_mul_1x1_ialu");

# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
#
# Multiplies the two-word polynomials a1:a0 and b1:b0 over GF(2) and writes
# the four-word product to r[0..3].
#
# Unless the build is restricted to 386 code, the generated function
# dispatches at run time on OPENSSL_ia32cap_P:
#   - MMX bit clear                      -> integer path ("ialu");
#   - FXSR and PCLMULQDQ bits both set   -> single PCLMULQDQ (only emitted
#     when -DOPENSSL_IA32_SSE2 was given);
#   - otherwise                          -> MMX path.
# The MMX and integer paths use Karatsuba: three 1x1 multiplications
# a1·b1, a0·b0 and (a0+a1)·(b0+b1), recombined as
#   r = a1·b1<<64 ^ ((a0+a1)·(b0+b1) ^ a1·b1 ^ a0·b0)<<32 ^ a0·b0.
# Arguments are fetched with &wparam(n), n=0 being r (helper provided by
# x86asm.pl; presumably it accounts for the preceding pushes - verify there).
&function_begin_B("bn_GF2m_mul_2x2");
if (!$x86only) {
	&picmeup("edx","OPENSSL_ia32cap_P");
	&mov	("eax",&DWP(0,"edx"));
	&mov	("edx",&DWP(4,"edx"));
	&test	("eax",1<<23);		# check MMX bit
	&jz	(&label("ialu"));
if ($sse2) {
	&test	("eax",1<<24);		# check FXSR bit
	&jz	(&label("mmx"));
	&test	("edx",1<<1);		# check PCLMULQDQ bit
	&jz	(&label("mmx"));

	# No registers were pushed yet, so the four argument words follow
	# r directly at 8(%esp): load them at once, swap the dwords within
	# each 64-bit half and multiply one half by the other.
	&movups		("xmm0",&QWP(8,"esp"));
	&shufps		("xmm0","xmm0",0b10110001);
	&pclmulqdq	("xmm0","xmm0",1);
	&mov		("eax",&DWP(4,"esp"));	# r
	&movups		(&QWP(0,"eax"),"xmm0");
	&ret	();

&set_label("mmx",16);
}
	# MMX path: _mul_1x1_mmx overwrites ebp/ebx/esi/edi, so save them.
	&push	("ebp");
	&push	("ebx");
	&push	("esi");
	&push	("edi");
	&mov	($a,&wparam(1));
	&mov	($b,&wparam(3));
	&call	("_mul_1x1_mmx");	# a1·b1
	&movq	("mm7",$R);

	&mov	($a,&wparam(2));
	&mov	($b,&wparam(4));
	&call	("_mul_1x1_mmx");	# a0·b0
	&movq	("mm6",$R);

	&mov	($a,&wparam(1));
	&mov	($b,&wparam(3));
	&xor	($a,&wparam(2));
	&xor	($b,&wparam(4));
	&call	("_mul_1x1_mmx");	# (a0+a1)·(b0+b1)
	&pxor	($R,"mm7");
	&mov	($a,&wparam(0));	# r
	&pxor	($R,"mm6");		# (a0+a1)·(b0+b1)-a1·b1-a0·b0

	# Split the middle term across the two result halves; register
	# restores are interleaved with the MMX instructions.
	&movq	($A,$R);
	&psllq	($R,32);
	&pop	("edi");
	&psrlq	($A,32);
	&pop	("esi");
	&pxor	($R,"mm6");		# low half:  a0·b0 ^ middle<<32
	&pop	("ebx");
	&pxor	($A,"mm7");		# high half: a1·b1 ^ middle>>32
	&movq	(&QWP(0,$a),$R);
	&pop	("ebp");
	&movq	(&QWP(8,$a),$A);
	&emms	();
	&ret	();
&set_label("ialu",16);
}
	# Integer path: save the registers _mul_1x1_ialu overwrites and
	# reserve stack words for the two partial products.
	&push	("ebp");
	&push	("ebx");
	&push	("esi");
	&push	("edi");
	&stack_push(4+1);

	&mov	($a,&wparam(1));
	&mov	($b,&wparam(3));
	&call	("_mul_1x1_ialu");	# a1·b1
	&mov	(&DWP(8,"esp"),$lo);
	&mov	(&DWP(12,"esp"),$hi);

	&mov	($a,&wparam(2));
	&mov	($b,&wparam(4));
	&call	("_mul_1x1_ialu");	# a0·b0
	&mov	(&DWP(0,"esp"),$lo);
	&mov	(&DWP(4,"esp"),$hi);

	&mov	($a,&wparam(1));
	&mov	($b,&wparam(3));
	&xor	($a,&wparam(2));
	&xor	($b,&wparam(4));
	&call	("_mul_1x1_ialu");	# (a0+a1)·(b0+b1)

	# @r[0..1] = a0·b0 (lo,hi), @r[2..3] = a1·b1 (lo,hi); $hi:$lo holds
	# (a0+a1)·(b0+b1).  The outer words r[0] and r[3] are stored as is,
	# the xor chain below produces r[1] in $lo and r[2] in $hi.
	&mov	("ebp",&wparam(0));
		 @r=("ebx","ecx","edi","esi");
	&mov	(@r[0],&DWP(0,"esp"));
	&mov	(@r[1],&DWP(4,"esp"));
	&mov	(@r[2],&DWP(8,"esp"));
	&mov	(@r[3],&DWP(12,"esp"));

	&xor	($lo,$hi);
	&xor	($hi,@r[1]);
	&xor	($lo,@r[0]);
	&mov	(&DWP(0,"ebp"),@r[0]);
	&xor	($hi,@r[2]);
	&mov	(&DWP(12,"ebp"),@r[3]);
	&xor	($lo,@r[3]);
	&stack_pop(4+1);
	&xor	($hi,@r[3]);
	&pop	("edi");
	&xor	($lo,$hi);
	&pop	("esi");
	&mov	(&DWP(8,"ebp"),$hi);
	&pop	("ebx");
	&mov	(&DWP(4,"ebp"),$lo);
	&pop	("ebp");
	&ret	();
&function_end_B("bn_GF2m_mul_2x2");

# Embed an identification string in the generated module, then let the
# perlasm back end finish the output.
&asciz	("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();
314