1c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org#!/usr/bin/env perl
2c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
3c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# ====================================================================
4c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# project. The module is, however, dual licensed under OpenSSL and
6c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# CRYPTOGAMS licenses depending on where you obtain it. For further
7c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# details see http://www.openssl.org/~appro/cryptogams/.
8c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# ====================================================================
9c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
10c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# October 2005
11c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org#
12c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# This is a "teaser" code, as it can be improved in several ways...
13c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# First of all non-SSE2 path should be implemented (yes, for now it
14c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# performs Montgomery multiplication/convolution only on SSE2-capable
15c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# CPUs such as P4, others fall down to original code). Then inner loop
16c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# can be unrolled and modulo-scheduled to improve ILP and possibly
17c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# moved to 128-bit XMM register bank (though it would require input
18c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# rearrangement and/or increase bus bandwidth utilization). Dedicated
19c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# squaring procedure should give further performance improvement...
20c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# Yet, for being draft, the code improves rsa512 *sign* benchmark by
21c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
23c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# December 2006
24c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org#
25c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# Integer-only code [being equipped with dedicated squaring procedure]
27c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# gives ~40% on rsa512 sign benchmark...
28c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
# Script preamble: locate the perlasm support library, load it, and work
# out from the command line whether the SSE2 code path should be generated.
#
# Capture the directory part of the script path ($0): everything up to and
# including the last '/' or '\'.  $dir is taken from $1, so it only carries
# a directory when $0 actually contains a path separator.
29480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Append the script's own directory and "<dir>../../perlasm" to the module
# search path so that the require below can find x86asm.pl.
30480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.orgpush(@INC,"${dir}","${dir}../../perlasm");
31c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.orgrequire "x86asm.pl";
32c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
# Initialise the generator with the first command-line argument and the
# script name.  asm_init is not defined in this file (presumably it comes
# from x86asm.pl -- confirm there for the meaning of the first argument).
33c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&asm_init($ARGV[0],$0);
34c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
# $sse2 is a generation-time switch: it becomes 1 when any command-line
# argument contains the text -DOPENSSL_IA32_SSE2, and stays 0 otherwise.
35c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$sse2=0;
36c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.orgfor (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
37c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
# OPENSSL_ia32cap_P is declared as an external label only when the SSE2
# path is generated; that path is the one that reads it further down.
38c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&external_label("OPENSSL_ia32cap_P") if ($sse2);
39c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
40c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&function_begin("bn_mul_mont");
41c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
42c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$i="edx";
43c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$j="ecx";
44c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$ap="esi";	$tp="esi";		# overlapping variables!!!
45c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$rp="edi";	$bp="edi";		# overlapping variables!!!
46c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$np="ebp";
47c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$num="ebx";
48c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
49c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$_num=&DWP(4*0,"esp");			# stack top layout
50c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$_rp=&DWP(4*1,"esp");
51c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$_ap=&DWP(4*2,"esp");
52c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$_bp=&DWP(4*3,"esp");
53c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$_np=&DWP(4*4,"esp");
54c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
55c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$_sp=&DWP(4*6,"esp");
56c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$_bpend=&DWP(4*7,"esp");
57c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$frame=32;				# size of above frame rounded up to 16n
58c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
59c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	("eax","eax");
60c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("edi",&wparam(5));	# int num
61c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&cmp	("edi",4);
62c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jl	(&label("just_leave"));
63c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
64c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	("esi",&wparam(0));	# put aside pointer to argument block
65c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	("edx",&wparam(1));	# load ap
66c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("ebp","esp");		# saved stack pointer!
67c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("edi",2);		# extra two words on top of tp
68c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&neg	("edi");
69c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
70c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&neg	("edi");
71c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
72c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	# minimize cache contention by arranging 2K window between stack
73c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	# pointer and ap argument [np is also position sensitive vector,
74c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	# but it's assumed to be near ap, as it's allocated at ~same
75c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	# time].
76c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax","esp");
77c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&sub	("eax","edx");
78c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&and	("eax",2047);
79c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&sub	("esp","eax");		# this aligns sp and ap modulo 2048
80c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
81c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	("edx","esp");
82c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&and	("edx",2048);
83c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	("edx",2048);
84c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&sub	("esp","edx");		# this splits them apart modulo 4096
85c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
86c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&and	("esp",-64);		# align to cache line
87c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
88c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	################################# load argument block...
89c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
90c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
91c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
92c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
93c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
94c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	#&mov	("edi",&DWP(5*4,"esi"));# int num
95c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
96c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
97c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($_rp,"eax");		# ... save a copy of argument block
98c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($_ap,"ebx");
99c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($_bp,"ecx");
100c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($_np,"edx");
101c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($_n0,"esi");
102c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
103c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	#&mov	($_num,$num);		# redundant as $num is not reused
104c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($_sp,"ebp");		# saved stack pointer!
105c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
106c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.orgif($sse2) {
107c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$acc0="mm0";	# mmx register bank layout
108c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$acc1="mm1";
109c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$car0="mm2";
110c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$car1="mm3";
111c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$mul0="mm4";
112c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$mul1="mm5";
113c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$temp="mm6";
114c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$mask="mm7";
115c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
116c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&picmeup("eax","OPENSSL_ia32cap_P");
117c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&bt	(&DWP(0,"eax"),26);
118c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jnc	(&label("non_sse2"));
119c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
120c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",-1);
121c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($mask,"eax");		# mask 32 lower bits
122c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
123c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($ap,$_ap);		# load input pointers
124c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($bp,$_bp);
125c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($np,$_np);
126c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
127c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	($i,$i);		# i=0
128c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	($j,$j);		# j=0
129c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
130c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($mul0,&DWP(0,$bp));		# bp[0]
131c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($mul1,&DWP(0,$ap));		# ap[0]
132c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($car1,&DWP(0,$np));		# np[0]
133c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
134c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
135c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movq	($car0,$mul1);
136c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movq	($acc0,$mul1);			# I wish movd worked for
137c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pand	($acc0,$mask);			# inter-register transfers
138c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
139c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($mul1,$_n0q);			# *=n0
140c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
141c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
142c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$acc0);
143c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
144c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($acc1,&DWP(4,$np));		# np[1]
145c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($acc0,&DWP(4,$ap));		# ap[1]
146c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
147c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&psrlq	($car0,32);
148c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&psrlq	($car1,32);
149c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
150c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&inc	($j);				# j++
151c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("1st",16);
152c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
153c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($acc1,$mul1);			# np[j]*m1
154c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car0,$acc0);			# +=c0
155c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$acc1);			# +=c1
156c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
157c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movq	($acc0,$car0);
158c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pand	($acc0,$mask);
159c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
160c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
161c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
162c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&psrlq	($car0,32);
163c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
164c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&psrlq	($car1,32);
165c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
166c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($j,&DWP(1,$j));
167c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&cmp	($j,$num);
168c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jl	(&label("1st"));
169c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
170c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
171c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($acc1,$mul1);			# np[num-1]*m1
172c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car0,$acc0);			# +=c0
173c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$acc1);			# +=c1
174c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
175c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movq	($acc0,$car0);
176c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pand	($acc0,$mask);
177c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
178c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
179c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
180c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&psrlq	($car0,32);
181c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&psrlq	($car1,32);
182c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
183c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$car0);
184c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
185c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
186c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&inc	($i);				# i++
187c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("outer");
188c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	($j,$j);			# j=0
189c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
190c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
191c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($mul1,&DWP(0,$ap));		# ap[0]
192c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
193c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($car1,&DWP(0,$np));		# np[0]
194c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
195c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
196c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($mul1,$temp);			# +=tp[0]
197c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movq	($acc0,$mul1);
198c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movq	($car0,$mul1);
199c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pand	($acc0,$mask);
200c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
201c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($mul1,$_n0q);			# *=n0
202c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
203c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($car1,$mul1);
204c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$acc0);
205c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
206c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
207c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($acc1,&DWP(4,$np));		# np[1]
208c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($acc0,&DWP(4,$ap));		# ap[1]
209c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
210c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&psrlq	($car0,32);
211c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&psrlq	($car1,32);
212c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car0,$temp);			# +=tp[1]
213c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
214c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&inc	($j);				# j++
215c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&dec	($num);
216c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("inner");
217c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
218c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($acc1,$mul1);			# np[j]*m1
219c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car0,$acc0);			# +=c0
220c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$acc1);			# +=c1
221c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
222c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movq	($acc0,$car0);
223c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
224c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pand	($acc0,$mask);
225c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
226c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
227c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
228c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&psrlq	($car0,32);
229c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
230c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&psrlq	($car1,32);
231c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car0,$temp);			# +=tp[j+1]
232c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
233c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&dec	($num);
234c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($j,&DWP(1,$j));		# j++
235c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jnz	(&label("inner"));
236c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
237c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($num,$j);
238c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
239c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pmuludq($acc1,$mul1);			# np[num-1]*m1
240c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car0,$acc0);			# +=c0
241c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$acc1);			# +=c1
242c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
243c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movq	($acc0,$car0);
244c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&pand	($acc0,$mask);
245c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
246c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
247c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&psrlq	($car0,32);
248c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&psrlq	($car1,32);
249c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
250c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
251c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$car0);
252c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&paddq	($car1,$temp);
253c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
254c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
255c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($i,&DWP(1,$i));		# i++
256c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&cmp	($i,$num);
257c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jle	(&label("outer"));
258c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
259c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&emms	();				# done with mmx bank
260c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jmp	(&label("common_tail"));
261c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
262c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("non_sse2",16);
263c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org}
264c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
265c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.orgif (0) {
266c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("esp",$_sp);
267c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	("eax","eax");	# signal "not fast enough [yet]"
268c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jmp	(&label("just_leave"));
269c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	# While the below code provides competitive performance for
270c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	# all key lengths on modern Intel cores, it's still more
271c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
272c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	# means compared to the original integer-only assembler.
273c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	# 512-bit RSA sign is better by ~40%, but that's about all
274c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	# one can say about all CPUs...
275c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org} else {
276c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$inp="esi";	# integer path uses these registers differently
277c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$word="edi";
278c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$carry="ebp";
279c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
280c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($inp,$_ap);
281c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($carry,&DWP(1,$num));
282c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($word,$_bp);
283c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	($j,$j);				# j=0
284c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("edx",$inp);
285c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&and	($carry,1);				# see if num is even
286c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&sub	("edx",$word);				# see if ap==bp
287c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
288c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&or	($carry,"edx");
289c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($word,&DWP(0,$word));			# bp[0]
290c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jz	(&label("bn_sqr_mont"));
291c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($_bpend,"eax");
292c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$inp));
293c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	("edx","edx");
294c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
295c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("mull",16);
296c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
297c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# ap[j]*bp[0]
298c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,"eax");
299c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($j,&DWP(1,$j));
300c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
301c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
302c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&cmp	($j,$num);
303c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
304c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jl	(&label("mull"));
305c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
306c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
307c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# ap[num-1]*bp[0]
308c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &mov	($word,$_n0);
309c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("eax",$carry);
310c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &mov	($inp,$_np);
311c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
312c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
313c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
314c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
315c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	($j,$j);
316c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
317c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
318c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
319c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$inp));			# np[0]
320c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# np[0]*m
321c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
322c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(4,$inp));			# np[1]
323c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
324c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&inc	($j);
325c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
326c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jmp	(&label("2ndmadd"));
327c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
328c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("1stmadd",16);
329c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
330c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# ap[j]*bp[i]
331c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
332c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($j,&DWP(1,$j));
333c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
334c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,"eax");
335c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
336c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
337c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&cmp	($j,$num);
338c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
339c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jl	(&label("1stmadd"));
340c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
341c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
342c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# ap[num-1]*bp[i]
343c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
344c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &mov	($word,$_n0);
345c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
346c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &mov	($inp,$_np);
347c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,"eax");
348c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
349c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
350c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
351c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	($j,$j);
352c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
353c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
354c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	($j,0);
355c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &mov	("eax",&DWP(0,$inp));			# np[0]
356c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
357c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
358c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
359c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# np[0]*m
360c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
361c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(4,$inp));			# np[1]
362c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
363c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($j,1);
364c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
365c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("2ndmadd",16);
366c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
367c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# np[j]*m
368c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
369c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($j,&DWP(1,$j));
370c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
371c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,"eax");
372c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
373c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
374c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&cmp	($j,$num);
375c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
376c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jl	(&label("2ndmadd"));
377c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
378c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
379c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# np[j]*m
380c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
381c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
382c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,"eax");
383c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
384c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
385c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
386c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	("eax","eax");
387c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &mov	($j,$_bp);				# &bp[i]
388c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
389c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
390c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &lea	($j,&DWP(4,$j));
391c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
392c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &cmp	($j,$_bpend);
393c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
394c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&je	(&label("common_tail"));
395c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
396c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($word,&DWP(0,$j));			# bp[i+1]
397c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($inp,$_ap);
398c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($_bp,$j);				# &bp[++i]
399c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	($j,$j);
400c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	("edx","edx");
401c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$inp));
402c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jmp	(&label("1stmadd"));
403c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
404c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("bn_sqr_mont",16);
405c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$sbit=$num;
406c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($_num,$num);
407c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($_bp,$j);				# i=0
408c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
409c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",$word);				# ap[0]
410c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# ap[0]*ap[0]
411c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
412c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($sbit,"edx");
413c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&shr	("edx",1);
414c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&and	($sbit,1);
415c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&inc	($j);
416c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("sqr",16);
417c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
418c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
419c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# ap[j]*ap[0]
420c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("eax",$carry);
421c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($j,&DWP(1,$j));
422c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
423c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($carry,&DWP(0,$sbit,"eax",2));
424c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&shr	("eax",31);
425c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&cmp	($j,$_num);
426c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($sbit,"eax");
427c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
428c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jl	(&label("sqr"));
429c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
430c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
431c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
432c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# ap[num-1]*ap[0]
433c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("eax",$carry);
434c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &mov	($word,$_n0);
435c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
436c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &mov	($inp,$_np);
437c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($carry,&DWP(0,$sbit,"eax",2));
438c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
439c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&shr	("eax",31);
440c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
441c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
442c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($carry,&DWP(0,"eax","edx",2));
443c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	 &mov	("eax",&DWP(0,$inp));			# np[0]
444c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&shr	("edx",31);
445c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
446c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
447c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
448c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# np[0]*m
449c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
450c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($num,$j);
451c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
452c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(4,$inp));			# np[1]
453c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($j,1);
454c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
455c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("3rdmadd",16);
456c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
457c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# np[j]*m
458c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
459c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
460c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,"eax");
461c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
462c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
463c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
464c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
465c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
466c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# np[j+1]*m
467c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
468c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($j,&DWP(2,$j));
469c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
470c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,"eax");
471c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
472c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
473c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&cmp	($j,$num);
474c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
475c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jl	(&label("3rdmadd"));
476c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
477c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
478c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# np[j]*m
479c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
480c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
481c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,"eax");
482c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
483c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
484c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
485c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($j,$_bp);				# i
486c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	("eax","eax");
487c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($inp,$_ap);
488c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
489c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
490c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
491c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&cmp	($j,$num);
492c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
493c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&je	(&label("common_tail"));
494c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
495c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
496c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($j,&DWP(1,$j));
497c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",$word);
498c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($_bp,$j);				# ++i
499c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# ap[i]*ap[i]
500c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
501c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
502c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
503c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	($carry,$carry);
504c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&cmp	($j,$num);
505c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($j,&DWP(1,$j));
506c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&je	(&label("sqrlast"));
507c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
508c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($sbit,"edx");				# zaps $num
509c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&shr	("edx",1);
510c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&and	($sbit,1);
511c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("sqradd",16);
512c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
513c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
514c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# ap[j]*ap[i]
515c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("eax",$carry);
516c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($carry,&DWP(0,"eax","eax"));
517c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
518c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&shr	("eax",31);
519c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
520c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($j,&DWP(1,$j));
521c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("eax",0);
522c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	($carry,$sbit);
523c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("eax",0);
524c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&cmp	($j,$_num);
525c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
526c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($sbit,"eax");
527c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jle	(&label("sqradd"));
528c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
529c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($carry,"edx");
530480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	&add	("edx","edx");
531c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&shr	($carry,31);
532480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	&add	("edx",$sbit);
533480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	&adc	($carry,0);
534c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("sqrlast");
535c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($word,$_n0);
536c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($inp,$_np);
537c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
538c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
539c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
540c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$inp));			# np[0]
541c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	($carry,0);
542c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
543c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
544c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
545c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mul	($word);				# np[0]*m
546c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
547c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($num,&DWP(-1,$j));
548c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&adc	("edx",0);
549c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($j,1);
550c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(4,$inp));			# np[1]
551c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
552c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jmp	(&label("3rdmadd"));
553c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org}
554c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
# ---------------------------------------------------------------------
# common_tail: final conditional subtraction and write-back.
#
# Jump target of the `je common_tail` exits taken at the end of the
# multiplication and squaring outer loops (other entry points may exist
# outside this chunk).
#
# NOTE(review): $np, $rp, $tp, $i, $j and $num are register aliases that
# are assigned outside this chunk; the "j=num-1" remark below implies
# that $num holds num-1 on entry here -- confirm against the code above.
#
# Steps performed:
#   1. "sub" : rp[i] = tp[i] - np[i], word by word, borrow chained in CF.
#   2. The final borrow is subtracted from the tp[] word that follows
#      the last one processed; the result is used as a bit mask to
#      choose between the tp and rp pointers without a branch.
#   3. "copy": the chosen vector is copied into rp[] while every tp[]
#      slot that has been read is overwritten.
#   4. The saved stack pointer is restored and eax is set to 1.
# ---------------------------------------------------------------------
555c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("common_tail",16);
556c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($np,$_np);			# load modulus pointer
557c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($rp,$_rp);			# load result pointer
558c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
559c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
# Prime the subtraction loop: eax carries the current tp[] word, $j is
# the down-counter and $i the up-counting index.  The xor both zeroes $i
# and clears CF so that the first sbb starts without a borrow.
560c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$tp));		# tp[0]
561c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($j,$num);			# j=num-1
562c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&xor	($i,$i);			# i=0 and clear CF!
563c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
# Borrow-propagating subtraction.  Only sbb touches CF inside the loop:
# dec updates SF/OF/ZF but leaves CF alone, and mov/lea modify no flags
# at all, so the borrow survives to the next pass and the jge below
# still tests the result of `dec $j`.  The loop therefore runs until $j
# drops below zero.
564c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("sub",16);
565c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&sbb	("eax",&DWP(0,$np,$i,4));
566c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
567c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&dec	($j);				# doesn't affect CF!
568c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
569c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&lea	($i,&DWP(1,$i));		# i++
570c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jge	(&label("sub"));
571c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
# At this point eax holds the tp[] word loaded in the last pass (one
# past the last word subtracted) and CF holds the final borrow.
# The and/not/and/or sequence computes
#     $tp = ($tp & eax) | ($rp & ~eax)
# which selects one of the two pointers without branching.
# NOTE(review): this is a clean select only if eax ends up as 0 or
# all-ones after the sbb -- presumably guaranteed by the range of the
# top word; confirm.  $np is reused as scratch from here on.
572c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&sbb	("eax",0);			# handle upmost overflow bit
573c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&and	($tp,"eax");
574c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&not	("eax");
575c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	($np,$rp);
576c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&and	($np,"eax");
577c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&or	($tp,$np);			# tp=carry?tp:rp
578c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
# Copy loop, indexed by $num counting down to 0 (jge tests `dec $num`).
# When $tp equals $rp the load/store pair rewrites rp[] with its own
# contents.  Each pass also overwrites the matching stack slot of the
# temporary vector with the leftover value of $j (the "sub" loop exits
# only once `dec $j` has gone negative).
579c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("copy",16);				# copy or in-place refresh
580c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",&DWP(0,$tp,$num,4));
581c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
582c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
583c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&dec	($num);
584c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&jge	(&label("copy"));
585c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
# Restore the caller's stack pointer and set eax to 1 (presumably the
# function's "handled" return value -- confirm with the C caller).
# A jump to just_leave (none is visible in this chunk) would skip both
# the stack restore and the eax=1 assignment.
586c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("esp",$_sp);		# pull saved stack pointer
587c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org	&mov	("eax",1);
588c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&set_label("just_leave");
589c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&function_end("bn_mul_mont");
590c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
# Module trailer.  &asciz and &asm_finish are perlasm helpers defined
# outside this file chunk.
# NOTE(review): presumably &asciz embeds the string below, NUL-terminated,
# in the generated assembly as an identification tag, and &asm_finish
# flushes the accumulated output -- confirm against the perlasm module.
591c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
592c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org
593c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&asm_finish();
594