1#!/usr/bin/env perl
2
3##############################################################################
4#                                                                            #
5#  Copyright (c) 2012, Intel Corporation                                     #
6#                                                                            #
7#  All rights reserved.                                                      #
8#                                                                            #
9#  Redistribution and use in source and binary forms, with or without        #
10#  modification, are permitted provided that the following conditions are    #
11#  met:                                                                      #
12#                                                                            #
13#  *  Redistributions of source code must retain the above copyright         #
14#     notice, this list of conditions and the following disclaimer.          #
15#                                                                            #
16#  *  Redistributions in binary form must reproduce the above copyright      #
17#     notice, this list of conditions and the following disclaimer in the    #
18#     documentation and/or other materials provided with the                 #
19#     distribution.                                                          #
20#                                                                            #
21#  *  Neither the name of the Intel Corporation nor the names of its         #
22#     contributors may be used to endorse or promote products derived from   #
23#     this software without specific prior written permission.               #
24#                                                                            #
25#                                                                            #
26#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
27#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
28#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
29#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
30#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
31#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
32#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
33#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
34#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
35#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
36#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
37#                                                                            #
38##############################################################################
39# Developers and authors:                                                    #
40# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
41# (1) Intel Architecture Group, Microprocessor and Chipset Development,      #
42#     Israel Development Center, Haifa, Israel                               #
43# (2) University of Haifa                                                    #
44##############################################################################
45# Reference:                                                                 #
46# [1] S. Gueron, "Efficient Software Implementations of Modular              #
47#     Exponentiation", http://eprint.iacr.org/2011/239                       #
48# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
49#     IEEE Proceedings of 9th International Conference on Information        #
50#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
51# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
52#     Journal of Cryptographic Engineering 2:31-43 (2012).                   #
53# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
54#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
55#     RSA1024 and RSA2048 on x86_64 platforms",                              #
56#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
57##############################################################################
58
59# While original submission covers 512- and 1024-bit exponentiation,
60# this module is limited to 512-bit version only (and as such
61# accelerates RSA1024 sign). This is because improvement for longer
62# keys is not high enough to justify the effort, highest measured
63# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
64# for the moment of this writing!] Nor does this module implement
65# "monolithic" complete exponentiation jumbo-subroutine, but adheres
66# to more modular mixture of C and assembly. And it's optimized even
67# for processors other than Intel Core family (see table below for
68# improvement coefficients).
69# 						<appro@openssl.org>
70#
71# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
72#			----------------+---------------------------
73# Opteron		+13%		|+5%		+20%
74# Bulldozer		-0%		|-1%		+10%
75# P4			+11%		|+7%		+8%
76# Westmere		+5%		|+14%		+17%
77# Sandy Bridge		+2%		|+12%		+29%
78# Ivy Bridge		+1%		|+11%		+35%
79# Haswell(**)		-0%		|+12%		+39%
80# Atom			+13%		|+11%		+4%
81# VIA Nano		+70%		|+9%		+25%
82#
83# (*)	rsax engine and fips numbers are presented for reference
84#	purposes;
85# (**)	MULX was attempted, but found to give only marginal improvement;
86
87$flavour = shift;
88$output  = shift;
89if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
90
91$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
92
93$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
94( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
95( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
96die "can't locate x86_64-xlate.pl";
97
98open OUT,"| \"$^X\" $xlate $flavour $output";
99*STDOUT=*OUT;
100
101# In upstream, this is controlled by shelling out to the compiler to check
102# versions, but BoringSSL is intended to be used with pre-generated perlasm
103# output, so this isn't useful anyway.
104#
105# TODO(davidben): Enable this after testing. $addx goes up to 1.
106$addx = 0;
107
108($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
109{
110my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
111
112$code.=<<___;
113.text
114
115.extern	OPENSSL_ia32cap_P
116
117.globl	rsaz_512_sqr
118.type	rsaz_512_sqr,\@function,5
119.align	32
120rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
121	push	%rbx
122	push	%rbp
123	push	%r12
124	push	%r13
125	push	%r14
126	push	%r15
127
128	subq	\$128+24, %rsp
129.Lsqr_body:
130	movq	$mod, %rbp		# common argument
131	movq	($inp), %rdx
132	movq	8($inp), %rax
133	movq	$n0, 128(%rsp)
134___
135$code.=<<___ if ($addx);
136	movl	\$0x80100,%r11d
137	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
138	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
139	je	.Loop_sqrx
140___
141$code.=<<___;
142	jmp	.Loop_sqr
143
144.align	32
145.Loop_sqr:
146	movl	$times,128+8(%rsp)
147#first iteration
148	movq	%rdx, %rbx
149	mulq	%rdx
150	movq	%rax, %r8
151	movq	16($inp), %rax
152	movq	%rdx, %r9
153
154	mulq	%rbx
155	addq	%rax, %r9
156	movq	24($inp), %rax
157	movq	%rdx, %r10
158	adcq	\$0, %r10
159
160	mulq	%rbx
161	addq	%rax, %r10
162	movq	32($inp), %rax
163	movq	%rdx, %r11
164	adcq	\$0, %r11
165
166	mulq	%rbx
167	addq	%rax, %r11
168	movq	40($inp), %rax
169	movq	%rdx, %r12
170	adcq	\$0, %r12
171
172	mulq	%rbx
173	addq	%rax, %r12
174	movq	48($inp), %rax
175	movq	%rdx, %r13
176	adcq	\$0, %r13
177
178	mulq	%rbx
179	addq	%rax, %r13
180	movq	56($inp), %rax
181	movq	%rdx, %r14
182	adcq	\$0, %r14
183
184	mulq	%rbx
185	addq	%rax, %r14
186	movq	%rbx, %rax
187	movq	%rdx, %r15
188	adcq	\$0, %r15
189
190	addq	%r8, %r8		#shlq	\$1, %r8
191	movq	%r9, %rcx
192	adcq	%r9, %r9		#shld	\$1, %r8, %r9
193
194	mulq	%rax
195	movq	%rax, (%rsp)
196	addq	%rdx, %r8
197	adcq	\$0, %r9
198
199	movq	%r8, 8(%rsp)
200	shrq	\$63, %rcx
201
202#second iteration
203	movq	8($inp), %r8
204	movq	16($inp), %rax
205	mulq	%r8
206	addq	%rax, %r10
207	movq	24($inp), %rax
208	movq	%rdx, %rbx
209	adcq	\$0, %rbx
210
211	mulq	%r8
212	addq	%rax, %r11
213	movq	32($inp), %rax
214	adcq	\$0, %rdx
215	addq	%rbx, %r11
216	movq	%rdx, %rbx
217	adcq	\$0, %rbx
218
219	mulq	%r8
220	addq	%rax, %r12
221	movq	40($inp), %rax
222	adcq	\$0, %rdx
223	addq	%rbx, %r12
224	movq	%rdx, %rbx
225	adcq	\$0, %rbx
226
227	mulq	%r8
228	addq	%rax, %r13
229	movq	48($inp), %rax
230	adcq	\$0, %rdx
231	addq	%rbx, %r13
232	movq	%rdx, %rbx
233	adcq	\$0, %rbx
234
235	mulq	%r8
236	addq	%rax, %r14
237	movq	56($inp), %rax
238	adcq	\$0, %rdx
239	addq	%rbx, %r14
240	movq	%rdx, %rbx
241	adcq	\$0, %rbx
242
243	mulq	%r8
244	addq	%rax, %r15
245	movq	%r8, %rax
246	adcq	\$0, %rdx
247	addq	%rbx, %r15
248	movq	%rdx, %r8
249	movq	%r10, %rdx
250	adcq	\$0, %r8
251
252	add	%rdx, %rdx
253	lea	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
254	movq	%r11, %rbx
255	adcq	%r11, %r11		#shld	\$1, %r10, %r11
256
257	mulq	%rax
258	addq	%rax, %r9
259	adcq	%rdx, %r10
260	adcq	\$0, %r11
261
262	movq	%r9, 16(%rsp)
263	movq	%r10, 24(%rsp)
264	shrq	\$63, %rbx
265
266#third iteration
267	movq	16($inp), %r9
268	movq	24($inp), %rax
269	mulq	%r9
270	addq	%rax, %r12
271	movq	32($inp), %rax
272	movq	%rdx, %rcx
273	adcq	\$0, %rcx
274
275	mulq	%r9
276	addq	%rax, %r13
277	movq	40($inp), %rax
278	adcq	\$0, %rdx
279	addq	%rcx, %r13
280	movq	%rdx, %rcx
281	adcq	\$0, %rcx
282
283	mulq	%r9
284	addq	%rax, %r14
285	movq	48($inp), %rax
286	adcq	\$0, %rdx
287	addq	%rcx, %r14
288	movq	%rdx, %rcx
289	adcq	\$0, %rcx
290
291	mulq	%r9
292	 movq	%r12, %r10
293	 lea	(%rbx,%r12,2), %r12	#shld	\$1, %rbx, %r12
294	addq	%rax, %r15
295	movq	56($inp), %rax
296	adcq	\$0, %rdx
297	addq	%rcx, %r15
298	movq	%rdx, %rcx
299	adcq	\$0, %rcx
300
301	mulq	%r9
302	 shrq	\$63, %r10
303	addq	%rax, %r8
304	movq	%r9, %rax
305	adcq	\$0, %rdx
306	addq	%rcx, %r8
307	movq	%rdx, %r9
308	adcq	\$0, %r9
309
310	movq	%r13, %rcx
311	leaq	(%r10,%r13,2), %r13	#shld	\$1, %r12, %r13
312
313	mulq	%rax
314	addq	%rax, %r11
315	adcq	%rdx, %r12
316	adcq	\$0, %r13
317
318	movq	%r11, 32(%rsp)
319	movq	%r12, 40(%rsp)
320	shrq	\$63, %rcx
321
322#fourth iteration
323	movq	24($inp), %r10
324	movq	32($inp), %rax
325	mulq	%r10
326	addq	%rax, %r14
327	movq	40($inp), %rax
328	movq	%rdx, %rbx
329	adcq	\$0, %rbx
330
331	mulq	%r10
332	addq	%rax, %r15
333	movq	48($inp), %rax
334	adcq	\$0, %rdx
335	addq	%rbx, %r15
336	movq	%rdx, %rbx
337	adcq	\$0, %rbx
338
339	mulq	%r10
340	 movq	%r14, %r12
341	 leaq	(%rcx,%r14,2), %r14	#shld	\$1, %rcx, %r14
342	addq	%rax, %r8
343	movq	56($inp), %rax
344	adcq	\$0, %rdx
345	addq	%rbx, %r8
346	movq	%rdx, %rbx
347	adcq	\$0, %rbx
348
349	mulq	%r10
350	 shrq	\$63, %r12
351	addq	%rax, %r9
352	movq	%r10, %rax
353	adcq	\$0, %rdx
354	addq	%rbx, %r9
355	movq	%rdx, %r10
356	adcq	\$0, %r10
357
358	movq	%r15, %rbx
359	leaq	(%r12,%r15,2),%r15	#shld	\$1, %r14, %r15
360
361	mulq	%rax
362	addq	%rax, %r13
363	adcq	%rdx, %r14
364	adcq	\$0, %r15
365
366	movq	%r13, 48(%rsp)
367	movq	%r14, 56(%rsp)
368	shrq	\$63, %rbx
369
370#fifth iteration
371	movq	32($inp), %r11
372	movq	40($inp), %rax
373	mulq	%r11
374	addq	%rax, %r8
375	movq	48($inp), %rax
376	movq	%rdx, %rcx
377	adcq	\$0, %rcx
378
379	mulq	%r11
380	addq	%rax, %r9
381	movq	56($inp), %rax
382	adcq	\$0, %rdx
383	 movq	%r8, %r12
384	 leaq	(%rbx,%r8,2), %r8	#shld	\$1, %rbx, %r8
385	addq	%rcx, %r9
386	movq	%rdx, %rcx
387	adcq	\$0, %rcx
388
389	mulq	%r11
390	 shrq	\$63, %r12
391	addq	%rax, %r10
392	movq	%r11, %rax
393	adcq	\$0, %rdx
394	addq	%rcx, %r10
395	movq	%rdx, %r11
396	adcq	\$0, %r11
397
398	movq	%r9, %rcx
399	leaq	(%r12,%r9,2), %r9	#shld	\$1, %r8, %r9
400
401	mulq	%rax
402	addq	%rax, %r15
403	adcq	%rdx, %r8
404	adcq	\$0, %r9
405
406	movq	%r15, 64(%rsp)
407	movq	%r8, 72(%rsp)
408	shrq	\$63, %rcx
409
410#sixth iteration
411	movq	40($inp), %r12
412	movq	48($inp), %rax
413	mulq	%r12
414	addq	%rax, %r10
415	movq	56($inp), %rax
416	movq	%rdx, %rbx
417	adcq	\$0, %rbx
418
419	mulq	%r12
420	addq	%rax, %r11
421	movq	%r12, %rax
422	 movq	%r10, %r15
423	 leaq	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
424	adcq	\$0, %rdx
425	 shrq	\$63, %r15
426	addq	%rbx, %r11
427	movq	%rdx, %r12
428	adcq	\$0, %r12
429
430	movq	%r11, %rbx
431	leaq	(%r15,%r11,2), %r11	#shld	\$1, %r10, %r11
432
433	mulq	%rax
434	addq	%rax, %r9
435	adcq	%rdx, %r10
436	adcq	\$0, %r11
437
438	movq	%r9, 80(%rsp)
439	movq	%r10, 88(%rsp)
440
441#seventh iteration
442	movq	48($inp), %r13
443	movq	56($inp), %rax
444	mulq	%r13
445	addq	%rax, %r12
446	movq	%r13, %rax
447	movq	%rdx, %r13
448	adcq	\$0, %r13
449
450	xorq	%r14, %r14
451	shlq	\$1, %rbx
452	adcq	%r12, %r12		#shld	\$1, %rbx, %r12
453	adcq	%r13, %r13		#shld	\$1, %r12, %r13
454	adcq	%r14, %r14		#shld	\$1, %r13, %r14
455
456	mulq	%rax
457	addq	%rax, %r11
458	adcq	%rdx, %r12
459	adcq	\$0, %r13
460
461	movq	%r11, 96(%rsp)
462	movq	%r12, 104(%rsp)
463
464#eighth iteration
465	movq	56($inp), %rax
466	mulq	%rax
467	addq	%rax, %r13
468	adcq	\$0, %rdx
469
470	addq	%rdx, %r14
471
472	movq	%r13, 112(%rsp)
473	movq	%r14, 120(%rsp)
474
475	movq	(%rsp), %r8
476	movq	8(%rsp), %r9
477	movq	16(%rsp), %r10
478	movq	24(%rsp), %r11
479	movq	32(%rsp), %r12
480	movq	40(%rsp), %r13
481	movq	48(%rsp), %r14
482	movq	56(%rsp), %r15
483
484	call	__rsaz_512_reduce
485
486	addq	64(%rsp), %r8
487	adcq	72(%rsp), %r9
488	adcq	80(%rsp), %r10
489	adcq	88(%rsp), %r11
490	adcq	96(%rsp), %r12
491	adcq	104(%rsp), %r13
492	adcq	112(%rsp), %r14
493	adcq	120(%rsp), %r15
494	sbbq	%rcx, %rcx
495
496	call	__rsaz_512_subtract
497
498	movq	%r8, %rdx
499	movq	%r9, %rax
500	movl	128+8(%rsp), $times
501	movq	$out, $inp
502
503	decl	$times
504	jnz	.Loop_sqr
505___
506if ($addx) {
507$code.=<<___;
508	jmp	.Lsqr_tail
509
510.align	32
511.Loop_sqrx:
512	movl	$times,128+8(%rsp)
513	movq	$out, %xmm0		# off-load
514	movq	%rbp, %xmm1		# off-load
515#first iteration
516	mulx	%rax, %r8, %r9
517
518	mulx	16($inp), %rcx, %r10
519	xor	%rbp, %rbp		# cf=0, of=0
520
521	mulx	24($inp), %rax, %r11
522	adcx	%rcx, %r9
523
524	mulx	32($inp), %rcx, %r12
525	adcx	%rax, %r10
526
527	mulx	40($inp), %rax, %r13
528	adcx	%rcx, %r11
529
530	.byte	0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($inp), %rcx, %r14
531	adcx	%rax, %r12
532	adcx	%rcx, %r13
533
534	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r15
535	adcx	%rax, %r14
536	adcx	%rbp, %r15		# %rbp is 0
537
538	mov	%r9, %rcx
539	shld	\$1, %r8, %r9
540	shl	\$1, %r8
541
542	xor	%ebp, %ebp
543	mulx	%rdx, %rax, %rdx
544	adcx	%rdx, %r8
545	 mov	8($inp), %rdx
546	adcx	%rbp, %r9
547
548	mov	%rax, (%rsp)
549	mov	%r8, 8(%rsp)
550
551#second iteration
552	mulx	16($inp), %rax, %rbx
553	adox	%rax, %r10
554	adcx	%rbx, %r11
555
556	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r8
557	adox	$out, %r11
558	adcx	%r8, %r12
559
560	mulx	32($inp), %rax, %rbx
561	adox	%rax, %r12
562	adcx	%rbx, %r13
563
564	mulx	40($inp), $out, %r8
565	adox	$out, %r13
566	adcx	%r8, %r14
567
568	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
569	adox	%rax, %r14
570	adcx	%rbx, %r15
571
572	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
573	adox	$out, %r15
574	adcx	%rbp, %r8
575	adox	%rbp, %r8
576
577	mov	%r11, %rbx
578	shld	\$1, %r10, %r11
579	shld	\$1, %rcx, %r10
580
581	xor	%ebp,%ebp
582	mulx	%rdx, %rax, %rcx
583	 mov	16($inp), %rdx
584	adcx	%rax, %r9
585	adcx	%rcx, %r10
586	adcx	%rbp, %r11
587
588	mov	%r9, 16(%rsp)
589	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)
590
591#third iteration
592	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r9
593	adox	$out, %r12
594	adcx	%r9, %r13
595
596	mulx	32($inp), %rax, %rcx
597	adox	%rax, %r13
598	adcx	%rcx, %r14
599
600	mulx	40($inp), $out, %r9
601	adox	$out, %r14
602	adcx	%r9, %r15
603
604	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
605	adox	%rax, %r15
606	adcx	%rcx, %r8
607
608	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r9
609	adox	$out, %r8
610	adcx	%rbp, %r9
611	adox	%rbp, %r9
612
613	mov	%r13, %rcx
614	shld	\$1, %r12, %r13
615	shld	\$1, %rbx, %r12
616
617	xor	%ebp, %ebp
618	mulx	%rdx, %rax, %rdx
619	adcx	%rax, %r11
620	adcx	%rdx, %r12
621	 mov	24($inp), %rdx
622	adcx	%rbp, %r13
623
624	mov	%r11, 32(%rsp)
625	.byte	0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00		# mov	%r12, 40(%rsp)
626
627#fourth iteration
628	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00	# mulx	32($inp), %rax, %rbx
629	adox	%rax, %r14
630	adcx	%rbx, %r15
631
632	mulx	40($inp), $out, %r10
633	adox	$out, %r15
634	adcx	%r10, %r8
635
636	mulx	48($inp), %rax, %rbx
637	adox	%rax, %r8
638	adcx	%rbx, %r9
639
640	mulx	56($inp), $out, %r10
641	adox	$out, %r9
642	adcx	%rbp, %r10
643	adox	%rbp, %r10
644
645	.byte	0x66
646	mov	%r15, %rbx
647	shld	\$1, %r14, %r15
648	shld	\$1, %rcx, %r14
649
650	xor	%ebp, %ebp
651	mulx	%rdx, %rax, %rdx
652	adcx	%rax, %r13
653	adcx	%rdx, %r14
654	 mov	32($inp), %rdx
655	adcx	%rbp, %r15
656
657	mov	%r13, 48(%rsp)
658	mov	%r14, 56(%rsp)
659
660#fifth iteration
661	.byte	0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r11
662	adox	$out, %r8
663	adcx	%r11, %r9
664
665	mulx	48($inp), %rax, %rcx
666	adox	%rax, %r9
667	adcx	%rcx, %r10
668
669	mulx	56($inp), $out, %r11
670	adox	$out, %r10
671	adcx	%rbp, %r11
672	adox	%rbp, %r11
673
674	mov	%r9, %rcx
675	shld	\$1, %r8, %r9
676	shld	\$1, %rbx, %r8
677
678	xor	%ebp, %ebp
679	mulx	%rdx, %rax, %rdx
680	adcx	%rax, %r15
681	adcx	%rdx, %r8
682	 mov	40($inp), %rdx
683	adcx	%rbp, %r9
684
685	mov	%r15, 64(%rsp)
686	mov	%r8, 72(%rsp)
687
688#sixth iteration
689	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
690	adox	%rax, %r10
691	adcx	%rbx, %r11
692
693	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
694	adox	$out, %r11
695	adcx	%rbp, %r12
696	adox	%rbp, %r12
697
698	mov	%r11, %rbx
699	shld	\$1, %r10, %r11
700	shld	\$1, %rcx, %r10
701
702	xor	%ebp, %ebp
703	mulx	%rdx, %rax, %rdx
704	adcx	%rax, %r9
705	adcx	%rdx, %r10
706	 mov	48($inp), %rdx
707	adcx	%rbp, %r11
708
709	mov	%r9, 80(%rsp)
710	mov	%r10, 88(%rsp)
711
712#seventh iteration
713	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
714	adox	%rax, %r12
715	adox	%rbp, %r13
716
717	xor	%r14, %r14
718	shld	\$1, %r13, %r14
719	shld	\$1, %r12, %r13
720	shld	\$1, %rbx, %r12
721
722	xor	%ebp, %ebp
723	mulx	%rdx, %rax, %rdx
724	adcx	%rax, %r11
725	adcx	%rdx, %r12
726	 mov	56($inp), %rdx
727	adcx	%rbp, %r13
728
729	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
730	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)
731
732#eighth iteration
733	mulx	%rdx, %rax, %rdx
734	adox	%rax, %r13
735	adox	%rbp, %rdx
736
737	.byte	0x66
738	add	%rdx, %r14
739
740	movq	%r13, 112(%rsp)
741	movq	%r14, 120(%rsp)
742	movq	%xmm0, $out
743	movq	%xmm1, %rbp
744
745	movq	128(%rsp), %rdx		# pull $n0
746	movq	(%rsp), %r8
747	movq	8(%rsp), %r9
748	movq	16(%rsp), %r10
749	movq	24(%rsp), %r11
750	movq	32(%rsp), %r12
751	movq	40(%rsp), %r13
752	movq	48(%rsp), %r14
753	movq	56(%rsp), %r15
754
755	call	__rsaz_512_reducex
756
757	addq	64(%rsp), %r8
758	adcq	72(%rsp), %r9
759	adcq	80(%rsp), %r10
760	adcq	88(%rsp), %r11
761	adcq	96(%rsp), %r12
762	adcq	104(%rsp), %r13
763	adcq	112(%rsp), %r14
764	adcq	120(%rsp), %r15
765	sbbq	%rcx, %rcx
766
767	call	__rsaz_512_subtract
768
769	movq	%r8, %rdx
770	movq	%r9, %rax
771	movl	128+8(%rsp), $times
772	movq	$out, $inp
773
774	decl	$times
775	jnz	.Loop_sqrx
776
777.Lsqr_tail:
778___
779}
780$code.=<<___;
781
782	leaq	128+24+48(%rsp), %rax
783	movq	-48(%rax), %r15
784	movq	-40(%rax), %r14
785	movq	-32(%rax), %r13
786	movq	-24(%rax), %r12
787	movq	-16(%rax), %rbp
788	movq	-8(%rax), %rbx
789	leaq	(%rax), %rsp
790.Lsqr_epilogue:
791	ret
792.size	rsaz_512_sqr,.-rsaz_512_sqr
793___
794}
795{
796my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
797$code.=<<___;
798.globl	rsaz_512_mul
799.type	rsaz_512_mul,\@function,5
800.align	32
801rsaz_512_mul:
802	push	%rbx
803	push	%rbp
804	push	%r12
805	push	%r13
806	push	%r14
807	push	%r15
808
809	subq	\$128+24, %rsp
810.Lmul_body:
811	movq	$out, %xmm0		# off-load arguments
812	movq	$mod, %xmm1
813	movq	$n0, 128(%rsp)
814___
815$code.=<<___ if ($addx);
816	movl	\$0x80100,%r11d
817	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
818	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
819	je	.Lmulx
820___
821$code.=<<___;
822	movq	($bp), %rbx		# pass b[0]
823	movq	$bp, %rbp		# pass argument
824	call	__rsaz_512_mul
825
826	movq	%xmm0, $out
827	movq	%xmm1, %rbp
828
829	movq	(%rsp), %r8
830	movq	8(%rsp), %r9
831	movq	16(%rsp), %r10
832	movq	24(%rsp), %r11
833	movq	32(%rsp), %r12
834	movq	40(%rsp), %r13
835	movq	48(%rsp), %r14
836	movq	56(%rsp), %r15
837
838	call	__rsaz_512_reduce
839___
840$code.=<<___ if ($addx);
841	jmp	.Lmul_tail
842
843.align	32
844.Lmulx:
845	movq	$bp, %rbp		# pass argument
846	movq	($bp), %rdx		# pass b[0]
847	call	__rsaz_512_mulx
848
849	movq	%xmm0, $out
850	movq	%xmm1, %rbp
851
852	movq	128(%rsp), %rdx		# pull $n0
853	movq	(%rsp), %r8
854	movq	8(%rsp), %r9
855	movq	16(%rsp), %r10
856	movq	24(%rsp), %r11
857	movq	32(%rsp), %r12
858	movq	40(%rsp), %r13
859	movq	48(%rsp), %r14
860	movq	56(%rsp), %r15
861
862	call	__rsaz_512_reducex
863.Lmul_tail:
864___
865$code.=<<___;
866	addq	64(%rsp), %r8
867	adcq	72(%rsp), %r9
868	adcq	80(%rsp), %r10
869	adcq	88(%rsp), %r11
870	adcq	96(%rsp), %r12
871	adcq	104(%rsp), %r13
872	adcq	112(%rsp), %r14
873	adcq	120(%rsp), %r15
874	sbbq	%rcx, %rcx
875
876	call	__rsaz_512_subtract
877
878	leaq	128+24+48(%rsp), %rax
879	movq	-48(%rax), %r15
880	movq	-40(%rax), %r14
881	movq	-32(%rax), %r13
882	movq	-24(%rax), %r12
883	movq	-16(%rax), %rbp
884	movq	-8(%rax), %rbx
885	leaq	(%rax), %rsp
886.Lmul_epilogue:
887	ret
888.size	rsaz_512_mul,.-rsaz_512_mul
889___
890}
891{
892my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
893$code.=<<___;
894.globl	rsaz_512_mul_gather4
895.type	rsaz_512_mul_gather4,\@function,6
896.align	32
897rsaz_512_mul_gather4:
898	push	%rbx
899	push	%rbp
900	push	%r12
901	push	%r13
902	push	%r14
903	push	%r15
904
905	mov	$pwr, $pwr
906	subq	\$128+24, %rsp
907.Lmul_gather4_body:
908___
909$code.=<<___ if ($addx);
910	movl	\$0x80100,%r11d
911	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
912	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
913	je	.Lmulx_gather
914___
915$code.=<<___;
916	movl	64($bp,$pwr,4), %eax
917	movq	$out, %xmm0		# off-load arguments
918	movl	($bp,$pwr,4), %ebx
919	movq	$mod, %xmm1
920	movq	$n0, 128(%rsp)
921
922	shlq	\$32, %rax
923	or	%rax, %rbx
924	movq	($ap), %rax
925	 movq	8($ap), %rcx
926	 leaq	128($bp,$pwr,4), %rbp
927	mulq	%rbx			# 0 iteration
928	movq	%rax, (%rsp)
929	movq	%rcx, %rax
930	movq	%rdx, %r8
931
932	mulq	%rbx
933	 movd	(%rbp), %xmm4
934	addq	%rax, %r8
935	movq	16($ap), %rax
936	movq	%rdx, %r9
937	adcq	\$0, %r9
938
939	mulq	%rbx
940	 movd	64(%rbp), %xmm5
941	addq	%rax, %r9
942	movq	24($ap), %rax
943	movq	%rdx, %r10
944	adcq	\$0, %r10
945
946	mulq	%rbx
947	 pslldq	\$4, %xmm5
948	addq	%rax, %r10
949	movq	32($ap), %rax
950	movq	%rdx, %r11
951	adcq	\$0, %r11
952
953	mulq	%rbx
954	 por	%xmm5, %xmm4
955	addq	%rax, %r11
956	movq	40($ap), %rax
957	movq	%rdx, %r12
958	adcq	\$0, %r12
959
960	mulq	%rbx
961	addq	%rax, %r12
962	movq	48($ap), %rax
963	movq	%rdx, %r13
964	adcq	\$0, %r13
965
966	mulq	%rbx
967	 leaq	128(%rbp), %rbp
968	addq	%rax, %r13
969	movq	56($ap), %rax
970	movq	%rdx, %r14
971	adcq	\$0, %r14
972
973	mulq	%rbx
974	 movq	%xmm4, %rbx
975	addq	%rax, %r14
976	 movq	($ap), %rax
977	movq	%rdx, %r15
978	adcq	\$0, %r15
979
980	leaq	8(%rsp), %rdi
981	movl	\$7, %ecx
982	jmp	.Loop_mul_gather
983
984.align	32
985.Loop_mul_gather:
986	mulq	%rbx
987	addq	%rax, %r8
988	movq	8($ap), %rax
989	movq	%r8, (%rdi)
990	movq	%rdx, %r8
991	adcq	\$0, %r8
992
993	mulq	%rbx
994	 movd	(%rbp), %xmm4
995	addq	%rax, %r9
996	movq	16($ap), %rax
997	adcq	\$0, %rdx
998	addq	%r9, %r8
999	movq	%rdx, %r9
1000	adcq	\$0, %r9
1001
1002	mulq	%rbx
1003	 movd	64(%rbp), %xmm5
1004	addq	%rax, %r10
1005	movq	24($ap), %rax
1006	adcq	\$0, %rdx
1007	addq	%r10, %r9
1008	movq	%rdx, %r10
1009	adcq	\$0, %r10
1010
1011	mulq	%rbx
1012	 pslldq	\$4, %xmm5
1013	addq	%rax, %r11
1014	movq	32($ap), %rax
1015	adcq	\$0, %rdx
1016	addq	%r11, %r10
1017	movq	%rdx, %r11
1018	adcq	\$0, %r11
1019
1020	mulq	%rbx
1021	 por	%xmm5, %xmm4
1022	addq	%rax, %r12
1023	movq	40($ap), %rax
1024	adcq	\$0, %rdx
1025	addq	%r12, %r11
1026	movq	%rdx, %r12
1027	adcq	\$0, %r12
1028
1029	mulq	%rbx
1030	addq	%rax, %r13
1031	movq	48($ap), %rax
1032	adcq	\$0, %rdx
1033	addq	%r13, %r12
1034	movq	%rdx, %r13
1035	adcq	\$0, %r13
1036
1037	mulq	%rbx
1038	addq	%rax, %r14
1039	movq	56($ap), %rax
1040	adcq	\$0, %rdx
1041	addq	%r14, %r13
1042	movq	%rdx, %r14
1043	adcq	\$0, %r14
1044
1045	mulq	%rbx
1046	 movq	%xmm4, %rbx
1047	addq	%rax, %r15
1048	 movq	($ap), %rax
1049	adcq	\$0, %rdx
1050	addq	%r15, %r14
1051	movq	%rdx, %r15
1052	adcq	\$0, %r15
1053
1054	leaq	128(%rbp), %rbp
1055	leaq	8(%rdi), %rdi
1056
1057	decl	%ecx
1058	jnz	.Loop_mul_gather
1059
1060	movq	%r8, (%rdi)
1061	movq	%r9, 8(%rdi)
1062	movq	%r10, 16(%rdi)
1063	movq	%r11, 24(%rdi)
1064	movq	%r12, 32(%rdi)
1065	movq	%r13, 40(%rdi)
1066	movq	%r14, 48(%rdi)
1067	movq	%r15, 56(%rdi)
1068
1069	movq	%xmm0, $out
1070	movq	%xmm1, %rbp
1071
1072	movq	(%rsp), %r8
1073	movq	8(%rsp), %r9
1074	movq	16(%rsp), %r10
1075	movq	24(%rsp), %r11
1076	movq	32(%rsp), %r12
1077	movq	40(%rsp), %r13
1078	movq	48(%rsp), %r14
1079	movq	56(%rsp), %r15
1080
1081	call	__rsaz_512_reduce
1082___
1083$code.=<<___ if ($addx);
1084	jmp	.Lmul_gather_tail
1085
1086.align	32
1087.Lmulx_gather:
1088	mov	64($bp,$pwr,4), %eax
1089	movq	$out, %xmm0		# off-load arguments
1090	lea	128($bp,$pwr,4), %rbp
1091	mov	($bp,$pwr,4), %edx
1092	movq	$mod, %xmm1
1093	mov	$n0, 128(%rsp)
1094
1095	shl	\$32, %rax
1096	or	%rax, %rdx
1097	mulx	($ap), %rbx, %r8	# 0 iteration
1098	mov	%rbx, (%rsp)
1099	xor	%edi, %edi		# cf=0, of=0
1100
1101	mulx	8($ap), %rax, %r9
1102	 movd	(%rbp), %xmm4
1103
1104	mulx	16($ap), %rbx, %r10
1105	 movd	64(%rbp), %xmm5
1106	adcx	%rax, %r8
1107
1108	mulx	24($ap), %rax, %r11
1109	 pslldq	\$4, %xmm5
1110	adcx	%rbx, %r9
1111
1112	mulx	32($ap), %rbx, %r12
1113	 por	%xmm5, %xmm4
1114	adcx	%rax, %r10
1115
1116	mulx	40($ap), %rax, %r13
1117	adcx	%rbx, %r11
1118
1119	mulx	48($ap), %rbx, %r14
1120	 lea	128(%rbp), %rbp
1121	adcx	%rax, %r12
1122
1123	mulx	56($ap), %rax, %r15
1124	 movq	%xmm4, %rdx
1125	adcx	%rbx, %r13
1126	adcx	%rax, %r14
1127	mov	%r8, %rbx
1128	adcx	%rdi, %r15		# %rdi is 0
1129
1130	mov	\$-7, %rcx
1131	jmp	.Loop_mulx_gather
1132
1133.align	32
1134.Loop_mulx_gather:
1135	mulx	($ap), %rax, %r8
1136	adcx	%rax, %rbx
1137	adox	%r9, %r8
1138
1139	mulx	8($ap), %rax, %r9
1140	.byte	0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00		# movd	(%rbp), %xmm4
1141	adcx	%rax, %r8
1142	adox	%r10, %r9
1143
1144	mulx	16($ap), %rax, %r10
1145	 movd	64(%rbp), %xmm5
1146	 lea	128(%rbp), %rbp
1147	adcx	%rax, %r9
1148	adox	%r11, %r10
1149
1150	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
1151	 pslldq	\$4, %xmm5
1152	 por	%xmm5, %xmm4
1153	adcx	%rax, %r10
1154	adox	%r12, %r11
1155
1156	mulx	32($ap), %rax, %r12
1157	adcx	%rax, %r11
1158	adox	%r13, %r12
1159
1160	mulx	40($ap), %rax, %r13
1161	adcx	%rax, %r12
1162	adox	%r14, %r13
1163
1164	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
1165	adcx	%rax, %r13
1166	adox	%r15, %r14
1167
1168	mulx	56($ap), %rax, %r15
1169	 movq	%xmm4, %rdx
1170	 mov	%rbx, 64(%rsp,%rcx,8)
1171	adcx	%rax, %r14
1172	adox	%rdi, %r15
1173	mov	%r8, %rbx
1174	adcx	%rdi, %r15		# cf=0
1175
1176	inc	%rcx			# of=0
1177	jnz	.Loop_mulx_gather
1178
1179	mov	%r8, 64(%rsp)
1180	mov	%r9, 64+8(%rsp)
1181	mov	%r10, 64+16(%rsp)
1182	mov	%r11, 64+24(%rsp)
1183	mov	%r12, 64+32(%rsp)
1184	mov	%r13, 64+40(%rsp)
1185	mov	%r14, 64+48(%rsp)
1186	mov	%r15, 64+56(%rsp)
1187
1188	movq	%xmm0, $out
1189	movq	%xmm1, %rbp
1190
1191	mov	128(%rsp), %rdx		# pull $n0
1192	mov	(%rsp), %r8
1193	mov	8(%rsp), %r9
1194	mov	16(%rsp), %r10
1195	mov	24(%rsp), %r11
1196	mov	32(%rsp), %r12
1197	mov	40(%rsp), %r13
1198	mov	48(%rsp), %r14
1199	mov	56(%rsp), %r15
1200
1201	call	__rsaz_512_reducex
1202
1203.Lmul_gather_tail:
1204___
1205$code.=<<___;
1206	addq	64(%rsp), %r8
1207	adcq	72(%rsp), %r9
1208	adcq	80(%rsp), %r10
1209	adcq	88(%rsp), %r11
1210	adcq	96(%rsp), %r12
1211	adcq	104(%rsp), %r13
1212	adcq	112(%rsp), %r14
1213	adcq	120(%rsp), %r15
1214	sbbq	%rcx, %rcx
1215
1216	call	__rsaz_512_subtract
1217
1218	leaq	128+24+48(%rsp), %rax
1219	movq	-48(%rax), %r15
1220	movq	-40(%rax), %r14
1221	movq	-32(%rax), %r13
1222	movq	-24(%rax), %r12
1223	movq	-16(%rax), %rbp
1224	movq	-8(%rax), %rbx
1225	leaq	(%rax), %rsp
1226.Lmul_gather4_epilogue:
1227	ret
1228.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1229___
1230}
1231{
1232my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1233$code.=<<___;
1234.globl	rsaz_512_mul_scatter4
1235.type	rsaz_512_mul_scatter4,\@function,6
1236.align	32
1237rsaz_512_mul_scatter4:
1238	push	%rbx
1239	push	%rbp
1240	push	%r12
1241	push	%r13
1242	push	%r14
1243	push	%r15
1244
1245	mov	$pwr, $pwr
1246	subq	\$128+24, %rsp
1247.Lmul_scatter4_body:
1248	leaq	($tbl,$pwr,4), $tbl
1249	movq	$out, %xmm0		# off-load arguments
1250	movq	$mod, %xmm1
1251	movq	$tbl, %xmm2
1252	movq	$n0, 128(%rsp)
1253
1254	movq	$out, %rbp
1255___
1256$code.=<<___ if ($addx);
1257	movl	\$0x80100,%r11d
1258	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
1259	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
1260	je	.Lmulx_scatter
1261___
1262$code.=<<___;
1263	movq	($out),%rbx		# pass b[0]
1264	call	__rsaz_512_mul
1265
1266	movq	%xmm0, $out
1267	movq	%xmm1, %rbp
1268
1269	movq	(%rsp), %r8
1270	movq	8(%rsp), %r9
1271	movq	16(%rsp), %r10
1272	movq	24(%rsp), %r11
1273	movq	32(%rsp), %r12
1274	movq	40(%rsp), %r13
1275	movq	48(%rsp), %r14
1276	movq	56(%rsp), %r15
1277
1278	call	__rsaz_512_reduce
1279___
1280$code.=<<___ if ($addx);
1281	jmp	.Lmul_scatter_tail
1282
1283.align	32
1284.Lmulx_scatter:
1285	movq	($out), %rdx		# pass b[0]
1286	call	__rsaz_512_mulx
1287
1288	movq	%xmm0, $out
1289	movq	%xmm1, %rbp
1290
1291	movq	128(%rsp), %rdx		# pull $n0
1292	movq	(%rsp), %r8
1293	movq	8(%rsp), %r9
1294	movq	16(%rsp), %r10
1295	movq	24(%rsp), %r11
1296	movq	32(%rsp), %r12
1297	movq	40(%rsp), %r13
1298	movq	48(%rsp), %r14
1299	movq	56(%rsp), %r15
1300
1301	call	__rsaz_512_reducex
1302
1303.Lmul_scatter_tail:
1304___
1305$code.=<<___;
1306	addq	64(%rsp), %r8
1307	adcq	72(%rsp), %r9
1308	adcq	80(%rsp), %r10
1309	adcq	88(%rsp), %r11
1310	adcq	96(%rsp), %r12
1311	adcq	104(%rsp), %r13
1312	adcq	112(%rsp), %r14
1313	adcq	120(%rsp), %r15
1314	movq	%xmm2, $inp
1315	sbbq	%rcx, %rcx
1316
1317	call	__rsaz_512_subtract
1318
1319	movl	%r8d, 64*0($inp)	# scatter
1320	shrq	\$32, %r8
1321	movl	%r9d, 64*2($inp)
1322	shrq	\$32, %r9
1323	movl	%r10d, 64*4($inp)
1324	shrq	\$32, %r10
1325	movl	%r11d, 64*6($inp)
1326	shrq	\$32, %r11
1327	movl	%r12d, 64*8($inp)
1328	shrq	\$32, %r12
1329	movl	%r13d, 64*10($inp)
1330	shrq	\$32, %r13
1331	movl	%r14d, 64*12($inp)
1332	shrq	\$32, %r14
1333	movl	%r15d, 64*14($inp)
1334	shrq	\$32, %r15
1335	movl	%r8d, 64*1($inp)
1336	movl	%r9d, 64*3($inp)
1337	movl	%r10d, 64*5($inp)
1338	movl	%r11d, 64*7($inp)
1339	movl	%r12d, 64*9($inp)
1340	movl	%r13d, 64*11($inp)
1341	movl	%r14d, 64*13($inp)
1342	movl	%r15d, 64*15($inp)
1343
1344	leaq	128+24+48(%rsp), %rax
1345	movq	-48(%rax), %r15
1346	movq	-40(%rax), %r14
1347	movq	-32(%rax), %r13
1348	movq	-24(%rax), %r12
1349	movq	-16(%rax), %rbp
1350	movq	-8(%rax), %rbx
1351	leaq	(%rax), %rsp
1352.Lmul_scatter4_epilogue:
1353	ret
1354.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1355___
1356}
1357{
1358my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1359$code.=<<___;
1360.globl	rsaz_512_mul_by_one
1361.type	rsaz_512_mul_by_one,\@function,4
1362.align	32
1363rsaz_512_mul_by_one:
1364	push	%rbx
1365	push	%rbp
1366	push	%r12
1367	push	%r13
1368	push	%r14
1369	push	%r15
1370
1371	subq	\$128+24, %rsp
1372.Lmul_by_one_body:
1373___
1374$code.=<<___ if ($addx);
1375	movl	OPENSSL_ia32cap_P+8(%rip),%eax
1376___
1377$code.=<<___;
1378	movq	$mod, %rbp	# reassign argument
1379	movq	$n0, 128(%rsp)
1380
1381	movq	($inp), %r8
1382	pxor	%xmm0, %xmm0
1383	movq	8($inp), %r9
1384	movq	16($inp), %r10
1385	movq	24($inp), %r11
1386	movq	32($inp), %r12
1387	movq	40($inp), %r13
1388	movq	48($inp), %r14
1389	movq	56($inp), %r15
1390
1391	movdqa	%xmm0, (%rsp)
1392	movdqa	%xmm0, 16(%rsp)
1393	movdqa	%xmm0, 32(%rsp)
1394	movdqa	%xmm0, 48(%rsp)
1395	movdqa	%xmm0, 64(%rsp)
1396	movdqa	%xmm0, 80(%rsp)
1397	movdqa	%xmm0, 96(%rsp)
1398___
1399$code.=<<___ if ($addx);
1400	andl	\$0x80100,%eax
1401	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
1402	je	.Lby_one_callx
1403___
1404$code.=<<___;
1405	call	__rsaz_512_reduce
1406___
1407$code.=<<___ if ($addx);
1408	jmp	.Lby_one_tail
1409.align	32
1410.Lby_one_callx:
1411	movq	128(%rsp), %rdx		# pull $n0
1412	call	__rsaz_512_reducex
1413.Lby_one_tail:
1414___
1415$code.=<<___;
1416	movq	%r8, ($out)
1417	movq	%r9, 8($out)
1418	movq	%r10, 16($out)
1419	movq	%r11, 24($out)
1420	movq	%r12, 32($out)
1421	movq	%r13, 40($out)
1422	movq	%r14, 48($out)
1423	movq	%r15, 56($out)
1424
1425	leaq	128+24+48(%rsp), %rax
1426	movq	-48(%rax), %r15
1427	movq	-40(%rax), %r14
1428	movq	-32(%rax), %r13
1429	movq	-24(%rax), %r12
1430	movq	-16(%rax), %rbp
1431	movq	-8(%rax), %rbx
1432	leaq	(%rax), %rsp
1433.Lmul_by_one_epilogue:
1434	ret
1435.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1436___
1437}
1438{	# __rsaz_512_reduce
1439	#
1440	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
1441	# output:	%r8-%r15
1442	# clobbers:	everything except %rbp and %rdi
1443$code.=<<___;
1444.type	__rsaz_512_reduce,\@abi-omnipotent
1445.align	32
1446__rsaz_512_reduce:
1447	movq	%r8, %rbx
1448	imulq	128+8(%rsp), %rbx
1449	movq	0(%rbp), %rax
1450	movl	\$8, %ecx
1451	jmp	.Lreduction_loop
1452
1453.align	32
1454.Lreduction_loop:
1455	mulq	%rbx
1456	movq	8(%rbp), %rax
1457	negq	%r8
1458	movq	%rdx, %r8
1459	adcq	\$0, %r8
1460
1461	mulq	%rbx
1462	addq	%rax, %r9
1463	movq	16(%rbp), %rax
1464	adcq	\$0, %rdx
1465	addq	%r9, %r8
1466	movq	%rdx, %r9
1467	adcq	\$0, %r9
1468
1469	mulq	%rbx
1470	addq	%rax, %r10
1471	movq	24(%rbp), %rax
1472	adcq	\$0, %rdx
1473	addq	%r10, %r9
1474	movq	%rdx, %r10
1475	adcq	\$0, %r10
1476
1477	mulq	%rbx
1478	addq	%rax, %r11
1479	movq	32(%rbp), %rax
1480	adcq	\$0, %rdx
1481	addq	%r11, %r10
1482	 movq	128+8(%rsp), %rsi
1483	#movq	%rdx, %r11
1484	#adcq	\$0, %r11
1485	adcq	\$0, %rdx
1486	movq	%rdx, %r11
1487
1488	mulq	%rbx
1489	addq	%rax, %r12
1490	movq	40(%rbp), %rax
1491	adcq	\$0, %rdx
1492	 imulq	%r8, %rsi
1493	addq	%r12, %r11
1494	movq	%rdx, %r12
1495	adcq	\$0, %r12
1496
1497	mulq	%rbx
1498	addq	%rax, %r13
1499	movq	48(%rbp), %rax
1500	adcq	\$0, %rdx
1501	addq	%r13, %r12
1502	movq	%rdx, %r13
1503	adcq	\$0, %r13
1504
1505	mulq	%rbx
1506	addq	%rax, %r14
1507	movq	56(%rbp), %rax
1508	adcq	\$0, %rdx
1509	addq	%r14, %r13
1510	movq	%rdx, %r14
1511	adcq	\$0, %r14
1512
1513	mulq	%rbx
1514	 movq	%rsi, %rbx
1515	addq	%rax, %r15
1516	 movq	0(%rbp), %rax
1517	adcq	\$0, %rdx
1518	addq	%r15, %r14
1519	movq	%rdx, %r15
1520	adcq	\$0, %r15
1521
1522	decl	%ecx
1523	jne	.Lreduction_loop
1524
1525	ret
1526.size	__rsaz_512_reduce,.-__rsaz_512_reduce
1527___
1528}
1529if ($addx) {
1530	# __rsaz_512_reducex
1531	#
1532	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
1533	# output:	%r8-%r15
1534	# clobbers:	everything except %rbp and %rdi
1535$code.=<<___;
1536.type	__rsaz_512_reducex,\@abi-omnipotent
1537.align	32
1538__rsaz_512_reducex:
1539	#movq	128+8(%rsp), %rdx		# pull $n0
1540	imulq	%r8, %rdx
1541	xorq	%rsi, %rsi			# cf=0,of=0
1542	movl	\$8, %ecx
1543	jmp	.Lreduction_loopx
1544
1545.align	32
1546.Lreduction_loopx:
1547	mov	%r8, %rbx
1548	mulx	0(%rbp), %rax, %r8
1549	adcx	%rbx, %rax
1550	adox	%r9, %r8
1551
1552	mulx	8(%rbp), %rax, %r9
1553	adcx	%rax, %r8
1554	adox	%r10, %r9
1555
1556	mulx	16(%rbp), %rbx, %r10
1557	adcx	%rbx, %r9
1558	adox	%r11, %r10
1559
1560	mulx	24(%rbp), %rbx, %r11
1561	adcx	%rbx, %r10
1562	adox	%r12, %r11
1563
1564	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
1565	 mov	%rdx, %rax
1566	 mov	%r8, %rdx
1567	adcx	%rbx, %r11
1568	adox	%r13, %r12
1569
1570	 mulx	128+8(%rsp), %rbx, %rdx
1571	 mov	%rax, %rdx
1572
1573	mulx	40(%rbp), %rax, %r13
1574	adcx	%rax, %r12
1575	adox	%r14, %r13
1576
1577	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
1578	adcx	%rax, %r13
1579	adox	%r15, %r14
1580
1581	mulx	56(%rbp), %rax, %r15
1582	 mov	%rbx, %rdx
1583	adcx	%rax, %r14
1584	adox	%rsi, %r15			# %rsi is 0
1585	adcx	%rsi, %r15			# cf=0
1586
1587	decl	%ecx				# of=0
1588	jne	.Lreduction_loopx
1589
1590	ret
1591.size	__rsaz_512_reducex,.-__rsaz_512_reducex
1592___
1593}
1594{	# __rsaz_512_subtract
1595	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1596	# output:
1597	# clobbers: everything but %rdi, %rsi and %rbp
1598$code.=<<___;
1599.type	__rsaz_512_subtract,\@abi-omnipotent
1600.align	32
1601__rsaz_512_subtract:
1602	movq	%r8, ($out)
1603	movq	%r9, 8($out)
1604	movq	%r10, 16($out)
1605	movq	%r11, 24($out)
1606	movq	%r12, 32($out)
1607	movq	%r13, 40($out)
1608	movq	%r14, 48($out)
1609	movq	%r15, 56($out)
1610
1611	movq	0($mod), %r8
1612	movq	8($mod), %r9
1613	negq	%r8
1614	notq	%r9
1615	andq	%rcx, %r8
1616	movq	16($mod), %r10
1617	andq	%rcx, %r9
1618	notq	%r10
1619	movq	24($mod), %r11
1620	andq	%rcx, %r10
1621	notq	%r11
1622	movq	32($mod), %r12
1623	andq	%rcx, %r11
1624	notq	%r12
1625	movq	40($mod), %r13
1626	andq	%rcx, %r12
1627	notq	%r13
1628	movq	48($mod), %r14
1629	andq	%rcx, %r13
1630	notq	%r14
1631	movq	56($mod), %r15
1632	andq	%rcx, %r14
1633	notq	%r15
1634	andq	%rcx, %r15
1635
1636	addq	($out), %r8
1637	adcq	8($out), %r9
1638	adcq	16($out), %r10
1639	adcq	24($out), %r11
1640	adcq	32($out), %r12
1641	adcq	40($out), %r13
1642	adcq	48($out), %r14
1643	adcq	56($out), %r15
1644
1645	movq	%r8, ($out)
1646	movq	%r9, 8($out)
1647	movq	%r10, 16($out)
1648	movq	%r11, 24($out)
1649	movq	%r12, 32($out)
1650	movq	%r13, 40($out)
1651	movq	%r14, 48($out)
1652	movq	%r15, 56($out)
1653
1654	ret
1655.size	__rsaz_512_subtract,.-__rsaz_512_subtract
1656___
1657}
1658{	# __rsaz_512_mul
1659	#
1660	# input: %rsi - ap, %rbp - bp
1661	# ouput:
1662	# clobbers: everything
1663my ($ap,$bp) = ("%rsi","%rbp");
1664$code.=<<___;
1665.type	__rsaz_512_mul,\@abi-omnipotent
1666.align	32
1667__rsaz_512_mul:
1668	leaq	8(%rsp), %rdi
1669
1670	movq	($ap), %rax
1671	mulq	%rbx
1672	movq	%rax, (%rdi)
1673	movq	8($ap), %rax
1674	movq	%rdx, %r8
1675
1676	mulq	%rbx
1677	addq	%rax, %r8
1678	movq	16($ap), %rax
1679	movq	%rdx, %r9
1680	adcq	\$0, %r9
1681
1682	mulq	%rbx
1683	addq	%rax, %r9
1684	movq	24($ap), %rax
1685	movq	%rdx, %r10
1686	adcq	\$0, %r10
1687
1688	mulq	%rbx
1689	addq	%rax, %r10
1690	movq	32($ap), %rax
1691	movq	%rdx, %r11
1692	adcq	\$0, %r11
1693
1694	mulq	%rbx
1695	addq	%rax, %r11
1696	movq	40($ap), %rax
1697	movq	%rdx, %r12
1698	adcq	\$0, %r12
1699
1700	mulq	%rbx
1701	addq	%rax, %r12
1702	movq	48($ap), %rax
1703	movq	%rdx, %r13
1704	adcq	\$0, %r13
1705
1706	mulq	%rbx
1707	addq	%rax, %r13
1708	movq	56($ap), %rax
1709	movq	%rdx, %r14
1710	adcq	\$0, %r14
1711
1712	mulq	%rbx
1713	addq	%rax, %r14
1714	 movq	($ap), %rax
1715	movq	%rdx, %r15
1716	adcq	\$0, %r15
1717
1718	leaq	8($bp), $bp
1719	leaq	8(%rdi), %rdi
1720
1721	movl	\$7, %ecx
1722	jmp	.Loop_mul
1723
1724.align	32
1725.Loop_mul:
1726	movq	($bp), %rbx
1727	mulq	%rbx
1728	addq	%rax, %r8
1729	movq	8($ap), %rax
1730	movq	%r8, (%rdi)
1731	movq	%rdx, %r8
1732	adcq	\$0, %r8
1733
1734	mulq	%rbx
1735	addq	%rax, %r9
1736	movq	16($ap), %rax
1737	adcq	\$0, %rdx
1738	addq	%r9, %r8
1739	movq	%rdx, %r9
1740	adcq	\$0, %r9
1741
1742	mulq	%rbx
1743	addq	%rax, %r10
1744	movq	24($ap), %rax
1745	adcq	\$0, %rdx
1746	addq	%r10, %r9
1747	movq	%rdx, %r10
1748	adcq	\$0, %r10
1749
1750	mulq	%rbx
1751	addq	%rax, %r11
1752	movq	32($ap), %rax
1753	adcq	\$0, %rdx
1754	addq	%r11, %r10
1755	movq	%rdx, %r11
1756	adcq	\$0, %r11
1757
1758	mulq	%rbx
1759	addq	%rax, %r12
1760	movq	40($ap), %rax
1761	adcq	\$0, %rdx
1762	addq	%r12, %r11
1763	movq	%rdx, %r12
1764	adcq	\$0, %r12
1765
1766	mulq	%rbx
1767	addq	%rax, %r13
1768	movq	48($ap), %rax
1769	adcq	\$0, %rdx
1770	addq	%r13, %r12
1771	movq	%rdx, %r13
1772	adcq	\$0, %r13
1773
1774	mulq	%rbx
1775	addq	%rax, %r14
1776	movq	56($ap), %rax
1777	adcq	\$0, %rdx
1778	addq	%r14, %r13
1779	movq	%rdx, %r14
1780	 leaq	8($bp), $bp
1781	adcq	\$0, %r14
1782
1783	mulq	%rbx
1784	addq	%rax, %r15
1785	 movq	($ap), %rax
1786	adcq	\$0, %rdx
1787	addq	%r15, %r14
1788	movq	%rdx, %r15
1789	adcq	\$0, %r15
1790
1791	leaq	8(%rdi), %rdi
1792
1793	decl	%ecx
1794	jnz	.Loop_mul
1795
1796	movq	%r8, (%rdi)
1797	movq	%r9, 8(%rdi)
1798	movq	%r10, 16(%rdi)
1799	movq	%r11, 24(%rdi)
1800	movq	%r12, 32(%rdi)
1801	movq	%r13, 40(%rdi)
1802	movq	%r14, 48(%rdi)
1803	movq	%r15, 56(%rdi)
1804
1805	ret
1806.size	__rsaz_512_mul,.-__rsaz_512_mul
1807___
1808}
1809if ($addx) {
1810	# __rsaz_512_mulx
1811	#
1812	# input: %rsi - ap, %rbp - bp
1813	# ouput:
1814	# clobbers: everything
1815my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1816$code.=<<___;
1817.type	__rsaz_512_mulx,\@abi-omnipotent
1818.align	32
1819__rsaz_512_mulx:
1820	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
1821	mov	\$-6, %rcx
1822
1823	mulx	8($ap), %rax, %r9
1824	movq	%rbx, 8(%rsp)
1825
1826	mulx	16($ap), %rbx, %r10
1827	adc	%rax, %r8
1828
1829	mulx	24($ap), %rax, %r11
1830	adc	%rbx, %r9
1831
1832	mulx	32($ap), %rbx, %r12
1833	adc	%rax, %r10
1834
1835	mulx	40($ap), %rax, %r13
1836	adc	%rbx, %r11
1837
1838	mulx	48($ap), %rbx, %r14
1839	adc	%rax, %r12
1840
1841	mulx	56($ap), %rax, %r15
1842	 mov	8($bp), %rdx
1843	adc	%rbx, %r13
1844	adc	%rax, %r14
1845	adc	\$0, %r15
1846
1847	xor	$zero, $zero		# cf=0,of=0
1848	jmp	.Loop_mulx
1849
1850.align	32
1851.Loop_mulx:
1852	movq	%r8, %rbx
1853	mulx	($ap), %rax, %r8
1854	adcx	%rax, %rbx
1855	adox	%r9, %r8
1856
1857	mulx	8($ap), %rax, %r9
1858	adcx	%rax, %r8
1859	adox	%r10, %r9
1860
1861	mulx	16($ap), %rax, %r10
1862	adcx	%rax, %r9
1863	adox	%r11, %r10
1864
1865	mulx	24($ap), %rax, %r11
1866	adcx	%rax, %r10
1867	adox	%r12, %r11
1868
1869	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
1870	adcx	%rax, %r11
1871	adox	%r13, %r12
1872
1873	mulx	40($ap), %rax, %r13
1874	adcx	%rax, %r12
1875	adox	%r14, %r13
1876
1877	mulx	48($ap), %rax, %r14
1878	adcx	%rax, %r13
1879	adox	%r15, %r14
1880
1881	mulx	56($ap), %rax, %r15
1882	 movq	64($bp,%rcx,8), %rdx
1883	 movq	%rbx, 8+64-8(%rsp,%rcx,8)
1884	adcx	%rax, %r14
1885	adox	$zero, %r15
1886	adcx	$zero, %r15		# cf=0
1887
1888	inc	%rcx			# of=0
1889	jnz	.Loop_mulx
1890
1891	movq	%r8, %rbx
1892	mulx	($ap), %rax, %r8
1893	adcx	%rax, %rbx
1894	adox	%r9, %r8
1895
1896	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
1897	adcx	%rax, %r8
1898	adox	%r10, %r9
1899
1900	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
1901	adcx	%rax, %r9
1902	adox	%r11, %r10
1903
1904	mulx	24($ap), %rax, %r11
1905	adcx	%rax, %r10
1906	adox	%r12, %r11
1907
1908	mulx	32($ap), %rax, %r12
1909	adcx	%rax, %r11
1910	adox	%r13, %r12
1911
1912	mulx	40($ap), %rax, %r13
1913	adcx	%rax, %r12
1914	adox	%r14, %r13
1915
1916	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
1917	adcx	%rax, %r13
1918	adox	%r15, %r14
1919
1920	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
1921	adcx	%rax, %r14
1922	adox	$zero, %r15
1923	adcx	$zero, %r15
1924
1925	mov	%rbx, 8+64-8(%rsp)
1926	mov	%r8, 8+64(%rsp)
1927	mov	%r9, 8+64+8(%rsp)
1928	mov	%r10, 8+64+16(%rsp)
1929	mov	%r11, 8+64+24(%rsp)
1930	mov	%r12, 8+64+32(%rsp)
1931	mov	%r13, 8+64+40(%rsp)
1932	mov	%r14, 8+64+48(%rsp)
1933	mov	%r15, 8+64+56(%rsp)
1934
1935	ret
1936.size	__rsaz_512_mulx,.-__rsaz_512_mulx
1937___
1938}
1939{
1940my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1941$code.=<<___;
1942.globl	rsaz_512_scatter4
1943.type	rsaz_512_scatter4,\@abi-omnipotent
1944.align	16
1945rsaz_512_scatter4:
1946	leaq	($out,$power,4), $out
1947	movl	\$8, %r9d
1948	jmp	.Loop_scatter
1949.align	16
1950.Loop_scatter:
1951	movq	($inp), %rax
1952	leaq	8($inp), $inp
1953	movl	%eax, ($out)
1954	shrq	\$32, %rax
1955	movl	%eax, 64($out)
1956	leaq	128($out), $out
1957	decl	%r9d
1958	jnz	.Loop_scatter
1959	ret
1960.size	rsaz_512_scatter4,.-rsaz_512_scatter4
1961
1962.globl	rsaz_512_gather4
1963.type	rsaz_512_gather4,\@abi-omnipotent
1964.align	16
1965rsaz_512_gather4:
1966	leaq	($inp,$power,4), $inp
1967	movl	\$8, %r9d
1968	jmp	.Loop_gather
1969.align	16
1970.Loop_gather:
1971	movl	($inp), %eax
1972	movl	64($inp), %r8d
1973	leaq	128($inp), $inp
1974	shlq	\$32, %r8
1975	or	%r8, %rax
1976	movq	%rax, ($out)
1977	leaq	8($out), $out
1978	decl	%r9d
1979	jnz	.Loop_gather
1980	ret
1981.size	rsaz_512_gather4,.-rsaz_512_gather4
1982___
1983}
1984
1985# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1986#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1987if ($win64) {
1988$rec="%rcx";
1989$frame="%rdx";
1990$context="%r8";
1991$disp="%r9";
1992
1993$code.=<<___;
1994.extern	__imp_RtlVirtualUnwind
1995.type	se_handler,\@abi-omnipotent
1996.align	16
1997se_handler:
1998	push	%rsi
1999	push	%rdi
2000	push	%rbx
2001	push	%rbp
2002	push	%r12
2003	push	%r13
2004	push	%r14
2005	push	%r15
2006	pushfq
2007	sub	\$64,%rsp
2008
2009	mov	120($context),%rax	# pull context->Rax
2010	mov	248($context),%rbx	# pull context->Rip
2011
2012	mov	8($disp),%rsi		# disp->ImageBase
2013	mov	56($disp),%r11		# disp->HandlerData
2014
2015	mov	0(%r11),%r10d		# HandlerData[0]
2016	lea	(%rsi,%r10),%r10	# end of prologue label
2017	cmp	%r10,%rbx		# context->Rip<end of prologue label
2018	jb	.Lcommon_seh_tail
2019
2020	mov	152($context),%rax	# pull context->Rsp
2021
2022	mov	4(%r11),%r10d		# HandlerData[1]
2023	lea	(%rsi,%r10),%r10	# epilogue label
2024	cmp	%r10,%rbx		# context->Rip>=epilogue label
2025	jae	.Lcommon_seh_tail
2026
2027	lea	128+24+48(%rax),%rax
2028
2029	mov	-8(%rax),%rbx
2030	mov	-16(%rax),%rbp
2031	mov	-24(%rax),%r12
2032	mov	-32(%rax),%r13
2033	mov	-40(%rax),%r14
2034	mov	-48(%rax),%r15
2035	mov	%rbx,144($context)	# restore context->Rbx
2036	mov	%rbp,160($context)	# restore context->Rbp
2037	mov	%r12,216($context)	# restore context->R12
2038	mov	%r13,224($context)	# restore context->R13
2039	mov	%r14,232($context)	# restore context->R14
2040	mov	%r15,240($context)	# restore context->R15
2041
2042.Lcommon_seh_tail:
2043	mov	8(%rax),%rdi
2044	mov	16(%rax),%rsi
2045	mov	%rax,152($context)	# restore context->Rsp
2046	mov	%rsi,168($context)	# restore context->Rsi
2047	mov	%rdi,176($context)	# restore context->Rdi
2048
2049	mov	40($disp),%rdi		# disp->ContextRecord
2050	mov	$context,%rsi		# context
2051	mov	\$154,%ecx		# sizeof(CONTEXT)
2052	.long	0xa548f3fc		# cld; rep movsq
2053
2054	mov	$disp,%rsi
2055	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2056	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2057	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2058	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2059	mov	40(%rsi),%r10		# disp->ContextRecord
2060	lea	56(%rsi),%r11		# &disp->HandlerData
2061	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2062	mov	%r10,32(%rsp)		# arg5
2063	mov	%r11,40(%rsp)		# arg6
2064	mov	%r12,48(%rsp)		# arg7
2065	mov	%rcx,56(%rsp)		# arg8, (NULL)
2066	call	*__imp_RtlVirtualUnwind(%rip)
2067
2068	mov	\$1,%eax		# ExceptionContinueSearch
2069	add	\$64,%rsp
2070	popfq
2071	pop	%r15
2072	pop	%r14
2073	pop	%r13
2074	pop	%r12
2075	pop	%rbp
2076	pop	%rbx
2077	pop	%rdi
2078	pop	%rsi
2079	ret
2080.size	sqr_handler,.-sqr_handler
2081
2082.section	.pdata
2083.align	4
2084	.rva	.LSEH_begin_rsaz_512_sqr
2085	.rva	.LSEH_end_rsaz_512_sqr
2086	.rva	.LSEH_info_rsaz_512_sqr
2087
2088	.rva	.LSEH_begin_rsaz_512_mul
2089	.rva	.LSEH_end_rsaz_512_mul
2090	.rva	.LSEH_info_rsaz_512_mul
2091
2092	.rva	.LSEH_begin_rsaz_512_mul_gather4
2093	.rva	.LSEH_end_rsaz_512_mul_gather4
2094	.rva	.LSEH_info_rsaz_512_mul_gather4
2095
2096	.rva	.LSEH_begin_rsaz_512_mul_scatter4
2097	.rva	.LSEH_end_rsaz_512_mul_scatter4
2098	.rva	.LSEH_info_rsaz_512_mul_scatter4
2099
2100	.rva	.LSEH_begin_rsaz_512_mul_by_one
2101	.rva	.LSEH_end_rsaz_512_mul_by_one
2102	.rva	.LSEH_info_rsaz_512_mul_by_one
2103
2104.section	.xdata
2105.align	8
2106.LSEH_info_rsaz_512_sqr:
2107	.byte	9,0,0,0
2108	.rva	se_handler
2109	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
2110.LSEH_info_rsaz_512_mul:
2111	.byte	9,0,0,0
2112	.rva	se_handler
2113	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
2114.LSEH_info_rsaz_512_mul_gather4:
2115	.byte	9,0,0,0
2116	.rva	se_handler
2117	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
2118.LSEH_info_rsaz_512_mul_scatter4:
2119	.byte	9,0,0,0
2120	.rva	se_handler
2121	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
2122.LSEH_info_rsaz_512_mul_by_one:
2123	.byte	9,0,0,0
2124	.rva	se_handler
2125	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
2126___
2127}
2128
2129$code =~ s/\`([^\`]*)\`/eval $1/gem;
2130print $code;
2131close STDOUT;
2132