1#!/usr/bin/env perl
2
3# Copyright (c) 2014, Intel Corporation.
4#
5# Permission to use, copy, modify, and/or distribute this software for any
6# purpose with or without fee is hereby granted, provided that the above
7# copyright notice and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
12# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
14# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
15# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17# Developers and authors:
18# Shay Gueron (1, 2), and Vlad Krasnov (1)
19# (1) Intel Corporation, Israel Development Center
20# (2) University of Haifa
21
22#  Reference:
23#  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
24#                           256 Bit Primes"
25
26# Further optimization by <appro@openssl.org>:
27#
28#		this/original
29# Opteron	+12-49%
30# Bulldozer	+14-45%
31# P4		+18-46%
32# Westmere	+12-34%
33# Sandy Bridge	+9-35%
34# Ivy Bridge	+9-35%
35# Haswell	+8-37%
36# Broadwell	+18-58%
37# Atom		+15-50%
38# VIA Nano	+43-160%
39#
40# Ranges denote minimum and maximum improvement coefficients depending
41# on benchmark.
42
43$flavour = shift;
44$output  = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52die "can't locate x86_64-xlate.pl";
53
54open OUT,"| \"$^X\" $xlate $flavour $output";
55*STDOUT=*OUT;
56
# TODO: enable these after testing; $avx should then be set to 2 and $addx to 1.
58$avx=0;
59$addx=0;
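# ($avx gates the AVX2 gather routines, ecp_nistz256_avx2_select_w5/w7,
# and $addx the MULX/ADCX/ADOX code paths such as __ecp_nistz256_mul_montx;
# with both left at 0 only the generic mulq-based and SSE2 code is emitted.)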
60
61$code.=<<___;
62.text
63.extern	OPENSSL_ia32cap_P
64
# The polynomial p = 2^256 - 2^224 + 2^192 + 2^96 - 1 (little-endian 64-bit limbs)
66.align 64
67.Lpoly:
68.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
69
70.LOne:
71.long 1,1,1,1,1,1,1,1
72.LTwo:
73.long 2,2,2,2,2,2,2,2
74.LThree:
75.long 3,3,3,3,3,3,3,3
76.LONE_mont:
77.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
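# .LONE_mont is 1 in the Montgomery domain, i.e. 2^256 mod p, while
# .LOne/.LTwo/.LThree are 32-bit broadcast constants used as running
# counters by the constant-time table-lookup routines further down.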
78___
79
80{
81################################################################################
82# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
83
84my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
85my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
86my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
87
88$code.=<<___;
89
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,\@function,2
91.align	64
92ecp_nistz256_mul_by_2:
93	push	%r12
94	push	%r13
95
96	mov	8*0($a_ptr), $a0
97	mov	8*1($a_ptr), $a1
98	add	$a0, $a0		# a0:a3+a0:a3
99	mov	8*2($a_ptr), $a2
100	adc	$a1, $a1
101	mov	8*3($a_ptr), $a3
102	lea	.Lpoly(%rip), $a_ptr
103	 mov	$a0, $t0
104	adc	$a2, $a2
105	adc	$a3, $a3
106	 mov	$a1, $t1
107	sbb	$t4, $t4
108
109	sub	8*0($a_ptr), $a0
110	 mov	$a2, $t2
111	sbb	8*1($a_ptr), $a1
112	sbb	8*2($a_ptr), $a2
113	 mov	$a3, $t3
114	sbb	8*3($a_ptr), $a3
115	test	$t4, $t4
116
117	cmovz	$t0, $a0
118	cmovz	$t1, $a1
119	mov	$a0, 8*0($r_ptr)
120	cmovz	$t2, $a2
121	mov	$a1, 8*1($r_ptr)
122	cmovz	$t3, $a3
123	mov	$a2, 8*2($r_ptr)
124	mov	$a3, 8*3($r_ptr)
125
126	pop	%r13
127	pop	%r12
128	ret
129.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
130
131################################################################################
132# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
133.globl	ecp_nistz256_neg
134.type	ecp_nistz256_neg,\@function,2
135.align	32
136ecp_nistz256_neg:
137	push	%r12
138	push	%r13
139
140	xor	$a0, $a0
141	xor	$a1, $a1
142	xor	$a2, $a2
143	xor	$a3, $a3
144	xor	$t4, $t4
145
146	sub	8*0($a_ptr), $a0
147	sbb	8*1($a_ptr), $a1
148	sbb	8*2($a_ptr), $a2
149	 mov	$a0, $t0
150	sbb	8*3($a_ptr), $a3
151	lea	.Lpoly(%rip), $a_ptr
152	 mov	$a1, $t1
153	sbb	\$0, $t4
154
155	add	8*0($a_ptr), $a0
156	 mov	$a2, $t2
157	adc	8*1($a_ptr), $a1
158	adc	8*2($a_ptr), $a2
159	 mov	$a3, $t3
160	adc	8*3($a_ptr), $a3
161	test	$t4, $t4
162
163	cmovz	$t0, $a0
164	cmovz	$t1, $a1
165	mov	$a0, 8*0($r_ptr)
166	cmovz	$t2, $a2
167	mov	$a1, 8*1($r_ptr)
168	cmovz	$t3, $a3
169	mov	$a2, 8*2($r_ptr)
170	mov	$a3, 8*3($r_ptr)
171
172	pop %r13
173	pop %r12
174	ret
175.size	ecp_nistz256_neg,.-ecp_nistz256_neg
176___
177}
178{
179my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
180my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
181my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
182my ($poly1,$poly3)=($acc6,$acc7);
183
184$code.=<<___;
185################################################################################
186# void ecp_nistz256_mul_mont(
187#   uint64_t res[4],
188#   uint64_t a[4],
189#   uint64_t b[4]);
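#
# Computes res = a * b * 2^-256 mod p; the inputs and the result are in
# the Montgomery domain with R = 2^256.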
190
191.globl	ecp_nistz256_mul_mont
192.type	ecp_nistz256_mul_mont,\@function,3
193.align	32
194ecp_nistz256_mul_mont:
195___
196$code.=<<___	if ($addx);
197	mov	\$0x80100, %ecx
198	and	OPENSSL_ia32cap_P+8(%rip), %ecx
199___
200$code.=<<___;
201.Lmul_mont:
202	push	%rbp
203	push	%rbx
204	push	%r12
205	push	%r13
206	push	%r14
207	push	%r15
208___
209$code.=<<___	if ($addx);
210	cmp	\$0x80100, %ecx
211	je	.Lmul_montx
212___
213$code.=<<___;
214	mov	$b_org, $b_ptr
215	mov	8*0($b_org), %rax
216	mov	8*0($a_ptr), $acc1
217	mov	8*1($a_ptr), $acc2
218	mov	8*2($a_ptr), $acc3
219	mov	8*3($a_ptr), $acc4
220
221	call	__ecp_nistz256_mul_montq
222___
223$code.=<<___	if ($addx);
224	jmp	.Lmul_mont_done
225
226.align	32
227.Lmul_montx:
228	mov	$b_org, $b_ptr
229	mov	8*0($b_org), %rdx
230	mov	8*0($a_ptr), $acc1
231	mov	8*1($a_ptr), $acc2
232	mov	8*2($a_ptr), $acc3
233	mov	8*3($a_ptr), $acc4
234	lea	-128($a_ptr), $a_ptr	# control u-op density
235
236	call	__ecp_nistz256_mul_montx
237___
238$code.=<<___;
239.Lmul_mont_done:
240	pop	%r15
241	pop	%r14
242	pop	%r13
243	pop	%r12
244	pop	%rbx
245	pop	%rbp
246	ret
247.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
248
249.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
250.align	32
251__ecp_nistz256_mul_montq:
252	########################################################################
253	# Multiply a by b[0]
254	mov	%rax, $t1
255	mulq	$acc1
256	mov	.Lpoly+8*1(%rip),$poly1
257	mov	%rax, $acc0
258	mov	$t1, %rax
259	mov	%rdx, $acc1
260
261	mulq	$acc2
262	mov	.Lpoly+8*3(%rip),$poly3
263	add	%rax, $acc1
264	mov	$t1, %rax
265	adc	\$0, %rdx
266	mov	%rdx, $acc2
267
268	mulq	$acc3
269	add	%rax, $acc2
270	mov	$t1, %rax
271	adc	\$0, %rdx
272	mov	%rdx, $acc3
273
274	mulq	$acc4
275	add	%rax, $acc3
276	 mov	$acc0, %rax
277	adc	\$0, %rdx
278	xor	$acc5, $acc5
279	mov	%rdx, $acc4
280
281	########################################################################
282	# First reduction step
283	# Basically now we want to multiply acc[0] by p256,
284	# and add the result to the acc.
285	# Due to the special form of p256 we do some optimizations
286	#
287	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
288	# then we add acc[0] and get acc[0] x 2^96
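	#
	# In more detail (illustrative): the limbs of p256 are p[0] = 2^64-1,
	# p[1] = 2^32-1, p[2] = 0, p[3] = 2^64-2^32+1, and since p256 = -1
	# mod 2^64 the Montgomery multiplier for this round is acc[0] itself.
	# acc[0]*(p[0] + p[1]*2^64) = acc[0]*(2^96 - 1), so after adding
	# acc[0] back only acc[0]*2^96 remains: acc[0]<<32 goes into limb 1
	# and acc[0]>>32 into limb 2 (the shl/shr below), while the remaining
	# acc[0]*p[3] term is produced by the mulq and added into limbs 3
	# and 4.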
289
290	mov	$acc0, $t1
291	shl	\$32, $acc0
292	mulq	$poly3
293	shr	\$32, $t1
294	add	$acc0, $acc1		# +=acc[0]<<96
295	adc	$t1, $acc2
296	adc	%rax, $acc3
297	 mov	8*1($b_ptr), %rax
298	adc	%rdx, $acc4
299	adc	\$0, $acc5
300	xor	$acc0, $acc0
301
302	########################################################################
303	# Multiply by b[1]
304	mov	%rax, $t1
305	mulq	8*0($a_ptr)
306	add	%rax, $acc1
307	mov	$t1, %rax
308	adc	\$0, %rdx
309	mov	%rdx, $t0
310
311	mulq	8*1($a_ptr)
312	add	$t0, $acc2
313	adc	\$0, %rdx
314	add	%rax, $acc2
315	mov	$t1, %rax
316	adc	\$0, %rdx
317	mov	%rdx, $t0
318
319	mulq	8*2($a_ptr)
320	add	$t0, $acc3
321	adc	\$0, %rdx
322	add	%rax, $acc3
323	mov	$t1, %rax
324	adc	\$0, %rdx
325	mov	%rdx, $t0
326
327	mulq	8*3($a_ptr)
328	add	$t0, $acc4
329	adc	\$0, %rdx
330	add	%rax, $acc4
331	 mov	$acc1, %rax
332	adc	%rdx, $acc5
333	adc	\$0, $acc0
334
335	########################################################################
336	# Second reduction step
337	mov	$acc1, $t1
338	shl	\$32, $acc1
339	mulq	$poly3
340	shr	\$32, $t1
341	add	$acc1, $acc2
342	adc	$t1, $acc3
343	adc	%rax, $acc4
344	 mov	8*2($b_ptr), %rax
345	adc	%rdx, $acc5
346	adc	\$0, $acc0
347	xor	$acc1, $acc1
348
349	########################################################################
350	# Multiply by b[2]
351	mov	%rax, $t1
352	mulq	8*0($a_ptr)
353	add	%rax, $acc2
354	mov	$t1, %rax
355	adc	\$0, %rdx
356	mov	%rdx, $t0
357
358	mulq	8*1($a_ptr)
359	add	$t0, $acc3
360	adc	\$0, %rdx
361	add	%rax, $acc3
362	mov	$t1, %rax
363	adc	\$0, %rdx
364	mov	%rdx, $t0
365
366	mulq	8*2($a_ptr)
367	add	$t0, $acc4
368	adc	\$0, %rdx
369	add	%rax, $acc4
370	mov	$t1, %rax
371	adc	\$0, %rdx
372	mov	%rdx, $t0
373
374	mulq	8*3($a_ptr)
375	add	$t0, $acc5
376	adc	\$0, %rdx
377	add	%rax, $acc5
378	 mov	$acc2, %rax
379	adc	%rdx, $acc0
380	adc	\$0, $acc1
381
382	########################################################################
383	# Third reduction step
384	mov	$acc2, $t1
385	shl	\$32, $acc2
386	mulq	$poly3
387	shr	\$32, $t1
388	add	$acc2, $acc3
389	adc	$t1, $acc4
390	adc	%rax, $acc5
391	 mov	8*3($b_ptr), %rax
392	adc	%rdx, $acc0
393	adc	\$0, $acc1
394	xor	$acc2, $acc2
395
396	########################################################################
397	# Multiply by b[3]
398	mov	%rax, $t1
399	mulq	8*0($a_ptr)
400	add	%rax, $acc3
401	mov	$t1, %rax
402	adc	\$0, %rdx
403	mov	%rdx, $t0
404
405	mulq	8*1($a_ptr)
406	add	$t0, $acc4
407	adc	\$0, %rdx
408	add	%rax, $acc4
409	mov	$t1, %rax
410	adc	\$0, %rdx
411	mov	%rdx, $t0
412
413	mulq	8*2($a_ptr)
414	add	$t0, $acc5
415	adc	\$0, %rdx
416	add	%rax, $acc5
417	mov	$t1, %rax
418	adc	\$0, %rdx
419	mov	%rdx, $t0
420
421	mulq	8*3($a_ptr)
422	add	$t0, $acc0
423	adc	\$0, %rdx
424	add	%rax, $acc0
425	 mov	$acc3, %rax
426	adc	%rdx, $acc1
427	adc	\$0, $acc2
428
429	########################################################################
430	# Final reduction step
431	mov	$acc3, $t1
432	shl	\$32, $acc3
433	mulq	$poly3
434	shr	\$32, $t1
435	add	$acc3, $acc4
436	adc	$t1, $acc5
437	 mov	$acc4, $t0
438	adc	%rax, $acc0
439	adc	%rdx, $acc1
440	 mov	$acc5, $t1
441	adc	\$0, $acc2
442
443	########################################################################
444	# Branch-less conditional subtraction of P
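	# (The intermediate result, including the carry limb, is less than
	#  2*p256, so a single conditional subtraction suffices.  p256 is
	#  subtracted speculatively -- "sub -1" subtracts p[0] = 2^64-1 --
	#  and the final borrow drives the cmovc's that restore the saved
	#  pre-subtraction copies if the subtraction underflowed.)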
445	sub	\$-1, $acc4		# .Lpoly[0]
446	 mov	$acc0, $t2
447	sbb	$poly1, $acc5		# .Lpoly[1]
448	sbb	\$0, $acc0		# .Lpoly[2]
449	 mov	$acc1, $t3
450	sbb	$poly3, $acc1		# .Lpoly[3]
451	sbb	\$0, $acc2
452
453	cmovc	$t0, $acc4
454	cmovc	$t1, $acc5
455	mov	$acc4, 8*0($r_ptr)
456	cmovc	$t2, $acc0
457	mov	$acc5, 8*1($r_ptr)
458	cmovc	$t3, $acc1
459	mov	$acc0, 8*2($r_ptr)
460	mov	$acc1, 8*3($r_ptr)
461
462	ret
463.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
464
465################################################################################
466# void ecp_nistz256_sqr_mont(
467#   uint64_t res[4],
468#   uint64_t a[4]);
469
470# we optimize the square according to S.Gueron and V.Krasnov,
471# "Speeding up Big-Number Squaring"
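# (sketch of the idea: compute each cross product a[i]*a[j], i<j, once,
#  double the whole partial sum in one carry chain, then add the diagonal
#  squares a[i]^2, and finish with the same four reduction steps as in
#  mul_mont)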
472.globl	ecp_nistz256_sqr_mont
473.type	ecp_nistz256_sqr_mont,\@function,2
474.align	32
475ecp_nistz256_sqr_mont:
476___
477$code.=<<___	if ($addx);
478	mov	\$0x80100, %ecx
479	and	OPENSSL_ia32cap_P+8(%rip), %ecx
480___
481$code.=<<___;
482	push	%rbp
483	push	%rbx
484	push	%r12
485	push	%r13
486	push	%r14
487	push	%r15
488___
489$code.=<<___	if ($addx);
490	cmp	\$0x80100, %ecx
491	je	.Lsqr_montx
492___
493$code.=<<___;
494	mov	8*0($a_ptr), %rax
495	mov	8*1($a_ptr), $acc6
496	mov	8*2($a_ptr), $acc7
497	mov	8*3($a_ptr), $acc0
498
499	call	__ecp_nistz256_sqr_montq
500___
501$code.=<<___	if ($addx);
502	jmp	.Lsqr_mont_done
503
504.align	32
505.Lsqr_montx:
506	mov	8*0($a_ptr), %rdx
507	mov	8*1($a_ptr), $acc6
508	mov	8*2($a_ptr), $acc7
509	mov	8*3($a_ptr), $acc0
510	lea	-128($a_ptr), $a_ptr	# control u-op density
511
512	call	__ecp_nistz256_sqr_montx
513___
514$code.=<<___;
515.Lsqr_mont_done:
516	pop	%r15
517	pop	%r14
518	pop	%r13
519	pop	%r12
520	pop	%rbx
521	pop	%rbp
522	ret
523.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
524
525.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
526.align	32
527__ecp_nistz256_sqr_montq:
528	mov	%rax, $acc5
529	mulq	$acc6			# a[1]*a[0]
530	mov	%rax, $acc1
531	mov	$acc7, %rax
532	mov	%rdx, $acc2
533
534	mulq	$acc5			# a[0]*a[2]
535	add	%rax, $acc2
536	mov	$acc0, %rax
537	adc	\$0, %rdx
538	mov	%rdx, $acc3
539
540	mulq	$acc5			# a[0]*a[3]
541	add	%rax, $acc3
542	 mov	$acc7, %rax
543	adc	\$0, %rdx
544	mov	%rdx, $acc4
545
546	#################################
547	mulq	$acc6			# a[1]*a[2]
548	add	%rax, $acc3
549	mov	$acc0, %rax
550	adc	\$0, %rdx
551	mov	%rdx, $t1
552
553	mulq	$acc6			# a[1]*a[3]
554	add	%rax, $acc4
555	 mov	$acc0, %rax
556	adc	\$0, %rdx
557	add	$t1, $acc4
558	mov	%rdx, $acc5
559	adc	\$0, $acc5
560
561	#################################
562	mulq	$acc7			# a[2]*a[3]
563	xor	$acc7, $acc7
564	add	%rax, $acc5
565	 mov	8*0($a_ptr), %rax
566	mov	%rdx, $acc6
567	adc	\$0, $acc6
568
569	add	$acc1, $acc1		# acc1:6<<1
570	adc	$acc2, $acc2
571	adc	$acc3, $acc3
572	adc	$acc4, $acc4
573	adc	$acc5, $acc5
574	adc	$acc6, $acc6
575	adc	\$0, $acc7
576
577	mulq	%rax
578	mov	%rax, $acc0
579	mov	8*1($a_ptr), %rax
580	mov	%rdx, $t0
581
582	mulq	%rax
583	add	$t0, $acc1
584	adc	%rax, $acc2
585	mov	8*2($a_ptr), %rax
586	adc	\$0, %rdx
587	mov	%rdx, $t0
588
589	mulq	%rax
590	add	$t0, $acc3
591	adc	%rax, $acc4
592	mov	8*3($a_ptr), %rax
593	adc	\$0, %rdx
594	mov	%rdx, $t0
595
596	mulq	%rax
597	add	$t0, $acc5
598	adc	%rax, $acc6
599	 mov	$acc0, %rax
600	adc	%rdx, $acc7
601
602	mov	.Lpoly+8*1(%rip), $a_ptr
603	mov	.Lpoly+8*3(%rip), $t1
604
605	##########################################
606	# Now the reduction
607	# First iteration
608	mov	$acc0, $t0
609	shl	\$32, $acc0
610	mulq	$t1
611	shr	\$32, $t0
612	add	$acc0, $acc1		# +=acc[0]<<96
613	adc	$t0, $acc2
614	adc	%rax, $acc3
615	 mov	$acc1, %rax
616	adc	\$0, %rdx
617
618	##########################################
619	# Second iteration
620	mov	$acc1, $t0
621	shl	\$32, $acc1
622	mov	%rdx, $acc0
623	mulq	$t1
624	shr	\$32, $t0
625	add	$acc1, $acc2
626	adc	$t0, $acc3
627	adc	%rax, $acc0
628	 mov	$acc2, %rax
629	adc	\$0, %rdx
630
631	##########################################
632	# Third iteration
633	mov	$acc2, $t0
634	shl	\$32, $acc2
635	mov	%rdx, $acc1
636	mulq	$t1
637	shr	\$32, $t0
638	add	$acc2, $acc3
639	adc	$t0, $acc0
640	adc	%rax, $acc1
641	 mov	$acc3, %rax
642	adc	\$0, %rdx
643
644	###########################################
645	# Last iteration
646	mov	$acc3, $t0
647	shl	\$32, $acc3
648	mov	%rdx, $acc2
649	mulq	$t1
650	shr	\$32, $t0
651	add	$acc3, $acc0
652	adc	$t0, $acc1
653	adc	%rax, $acc2
654	adc	\$0, %rdx
655	xor	$acc3, $acc3
656
657	############################################
658	# Add the rest of the acc
659	add	$acc0, $acc4
660	adc	$acc1, $acc5
661	 mov	$acc4, $acc0
662	adc	$acc2, $acc6
663	adc	%rdx, $acc7
664	 mov	$acc5, $acc1
665	adc	\$0, $acc3
666
667	sub	\$-1, $acc4		# .Lpoly[0]
668	 mov	$acc6, $acc2
669	sbb	$a_ptr, $acc5		# .Lpoly[1]
670	sbb	\$0, $acc6		# .Lpoly[2]
671	 mov	$acc7, $t0
672	sbb	$t1, $acc7		# .Lpoly[3]
673	sbb	\$0, $acc3
674
675	cmovc	$acc0, $acc4
676	cmovc	$acc1, $acc5
677	mov	$acc4, 8*0($r_ptr)
678	cmovc	$acc2, $acc6
679	mov	$acc5, 8*1($r_ptr)
680	cmovc	$t0, $acc7
681	mov	$acc6, 8*2($r_ptr)
682	mov	$acc7, 8*3($r_ptr)
683
684	ret
685.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
686___
687
688if ($addx) {
689$code.=<<___;
690.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
691.align	32
692__ecp_nistz256_mul_montx:
693	########################################################################
694	# Multiply by b[0]
695	mulx	$acc1, $acc0, $acc1
696	mulx	$acc2, $t0, $acc2
697	mov	\$32, $poly1
698	xor	$acc5, $acc5		# cf=0
699	mulx	$acc3, $t1, $acc3
700	mov	.Lpoly+8*3(%rip), $poly3
701	adc	$t0, $acc1
702	mulx	$acc4, $t0, $acc4
703	 mov	$acc0, %rdx
704	adc	$t1, $acc2
705	 shlx	$poly1,$acc0,$t1
706	adc	$t0, $acc3
707	 shrx	$poly1,$acc0,$t0
708	adc	\$0, $acc4
709
710	########################################################################
711	# First reduction step
712	add	$t1, $acc1
713	adc	$t0, $acc2
714
715	mulx	$poly3, $t0, $t1
716	 mov	8*1($b_ptr), %rdx
717	adc	$t0, $acc3
718	adc	$t1, $acc4
719	adc	\$0, $acc5
720	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
721
722	########################################################################
723	# Multiply by b[1]
724	mulx	8*0+128($a_ptr), $t0, $t1
725	adcx	$t0, $acc1
726	adox	$t1, $acc2
727
728	mulx	8*1+128($a_ptr), $t0, $t1
729	adcx	$t0, $acc2
730	adox	$t1, $acc3
731
732	mulx	8*2+128($a_ptr), $t0, $t1
733	adcx	$t0, $acc3
734	adox	$t1, $acc4
735
736	mulx	8*3+128($a_ptr), $t0, $t1
737	 mov	$acc1, %rdx
738	adcx	$t0, $acc4
739	 shlx	$poly1, $acc1, $t0
740	adox	$t1, $acc5
741	 shrx	$poly1, $acc1, $t1
742
743	adcx	$acc0, $acc5
744	adox	$acc0, $acc0
745	adc	\$0, $acc0
746
747	########################################################################
748	# Second reduction step
749	add	$t0, $acc2
750	adc	$t1, $acc3
751
752	mulx	$poly3, $t0, $t1
753	 mov	8*2($b_ptr), %rdx
754	adc	$t0, $acc4
755	adc	$t1, $acc5
756	adc	\$0, $acc0
757	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
758
759	########################################################################
760	# Multiply by b[2]
761	mulx	8*0+128($a_ptr), $t0, $t1
762	adcx	$t0, $acc2
763	adox	$t1, $acc3
764
765	mulx	8*1+128($a_ptr), $t0, $t1
766	adcx	$t0, $acc3
767	adox	$t1, $acc4
768
769	mulx	8*2+128($a_ptr), $t0, $t1
770	adcx	$t0, $acc4
771	adox	$t1, $acc5
772
773	mulx	8*3+128($a_ptr), $t0, $t1
774	 mov	$acc2, %rdx
775	adcx	$t0, $acc5
776	 shlx	$poly1, $acc2, $t0
777	adox	$t1, $acc0
778	 shrx	$poly1, $acc2, $t1
779
780	adcx	$acc1, $acc0
781	adox	$acc1, $acc1
782	adc	\$0, $acc1
783
784	########################################################################
785	# Third reduction step
786	add	$t0, $acc3
787	adc	$t1, $acc4
788
789	mulx	$poly3, $t0, $t1
790	 mov	8*3($b_ptr), %rdx
791	adc	$t0, $acc5
792	adc	$t1, $acc0
793	adc	\$0, $acc1
794	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
795
796	########################################################################
797	# Multiply by b[3]
798	mulx	8*0+128($a_ptr), $t0, $t1
799	adcx	$t0, $acc3
800	adox	$t1, $acc4
801
802	mulx	8*1+128($a_ptr), $t0, $t1
803	adcx	$t0, $acc4
804	adox	$t1, $acc5
805
806	mulx	8*2+128($a_ptr), $t0, $t1
807	adcx	$t0, $acc5
808	adox	$t1, $acc0
809
810	mulx	8*3+128($a_ptr), $t0, $t1
811	 mov	$acc3, %rdx
812	adcx	$t0, $acc0
813	 shlx	$poly1, $acc3, $t0
814	adox	$t1, $acc1
815	 shrx	$poly1, $acc3, $t1
816
817	adcx	$acc2, $acc1
818	adox	$acc2, $acc2
819	adc	\$0, $acc2
820
821	########################################################################
822	# Fourth reduction step
823	add	$t0, $acc4
824	adc	$t1, $acc5
825
826	mulx	$poly3, $t0, $t1
827	 mov	$acc4, $t2
828	mov	.Lpoly+8*1(%rip), $poly1
829	adc	$t0, $acc0
830	 mov	$acc5, $t3
831	adc	$t1, $acc1
832	adc	\$0, $acc2
833
834	########################################################################
835	# Branch-less conditional subtraction of P
836	xor	%eax, %eax
837	 mov	$acc0, $t0
838	sbb	\$-1, $acc4		# .Lpoly[0]
839	sbb	$poly1, $acc5		# .Lpoly[1]
840	sbb	\$0, $acc0		# .Lpoly[2]
841	 mov	$acc1, $t1
842	sbb	$poly3, $acc1		# .Lpoly[3]
843	sbb	\$0, $acc2
844
845	cmovc	$t2, $acc4
846	cmovc	$t3, $acc5
847	mov	$acc4, 8*0($r_ptr)
848	cmovc	$t0, $acc0
849	mov	$acc5, 8*1($r_ptr)
850	cmovc	$t1, $acc1
851	mov	$acc0, 8*2($r_ptr)
852	mov	$acc1, 8*3($r_ptr)
853
854	ret
855.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
856
857.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
858.align	32
859__ecp_nistz256_sqr_montx:
860	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
861	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
862	xor	%eax, %eax
863	adc	$t0, $acc2
864	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
865	 mov	$acc6, %rdx
866	adc	$t1, $acc3
867	adc	\$0, $acc4
868	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
869
870	#################################
871	mulx	$acc7, $t0, $t1		# a[1]*a[2]
872	adcx	$t0, $acc3
873	adox	$t1, $acc4
874
875	mulx	$acc0, $t0, $t1		# a[1]*a[3]
876	 mov	$acc7, %rdx
877	adcx	$t0, $acc4
878	adox	$t1, $acc5
879	adc	\$0, $acc5
880
881	#################################
882	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
883	 mov	8*0+128($a_ptr), %rdx
884	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
885	 adcx	$acc1, $acc1		# acc1:6<<1
886	adox	$t0, $acc5
887	 adcx	$acc2, $acc2
888	adox	$acc7, $acc6		# of=0
889
890	mulx	%rdx, $acc0, $t1
891	mov	8*1+128($a_ptr), %rdx
892	 adcx	$acc3, $acc3
893	adox	$t1, $acc1
894	 adcx	$acc4, $acc4
895	mulx	%rdx, $t0, $t4
896	mov	8*2+128($a_ptr), %rdx
897	 adcx	$acc5, $acc5
898	adox	$t0, $acc2
899	 adcx	$acc6, $acc6
900	.byte	0x67
901	mulx	%rdx, $t0, $t1
902	mov	8*3+128($a_ptr), %rdx
903	adox	$t4, $acc3
904	 adcx	$acc7, $acc7
905	adox	$t0, $acc4
906	 mov	\$32, $a_ptr
907	adox	$t1, $acc5
908	.byte	0x67,0x67
909	mulx	%rdx, $t0, $t4
910	 mov	$acc0, %rdx
911	adox	$t0, $acc6
912	 shlx	$a_ptr, $acc0, $t0
913	adox	$t4, $acc7
914	 shrx	$a_ptr, $acc0, $t4
915	 mov	.Lpoly+8*3(%rip), $t1
916
917	# reduction step 1
918	add	$t0, $acc1
919	adc	$t4, $acc2
920
921	mulx	$t1, $t0, $acc0
922	 mov	$acc1, %rdx
923	adc	$t0, $acc3
924	 shlx	$a_ptr, $acc1, $t0
925	adc	\$0, $acc0
926	 shrx	$a_ptr, $acc1, $t4
927
928	# reduction step 2
929	add	$t0, $acc2
930	adc	$t4, $acc3
931
932	mulx	$t1, $t0, $acc1
933	 mov	$acc2, %rdx
934	adc	$t0, $acc0
935	 shlx	$a_ptr, $acc2, $t0
936	adc	\$0, $acc1
937	 shrx	$a_ptr, $acc2, $t4
938
939	# reduction step 3
940	add	$t0, $acc3
941	adc	$t4, $acc0
942
943	mulx	$t1, $t0, $acc2
944	 mov	$acc3, %rdx
945	adc	$t0, $acc1
946	 shlx	$a_ptr, $acc3, $t0
947	adc	\$0, $acc2
948	 shrx	$a_ptr, $acc3, $t4
949
950	# reduction step 4
951	add	$t0, $acc0
952	adc	$t4, $acc1
953
954	mulx	$t1, $t0, $acc3
955	adc	$t0, $acc2
956	adc	\$0, $acc3
957
958	xor	$t3, $t3		# cf=0
959	adc	$acc0, $acc4		# accumulate upper half
960	 mov	.Lpoly+8*1(%rip), $a_ptr
961	adc	$acc1, $acc5
962	 mov	$acc4, $acc0
963	adc	$acc2, $acc6
964	adc	$acc3, $acc7
965	 mov	$acc5, $acc1
966	adc	\$0, $t3
967
968	xor	%eax, %eax		# cf=0
969	sbb	\$-1, $acc4		# .Lpoly[0]
970	 mov	$acc6, $acc2
971	sbb	$a_ptr, $acc5		# .Lpoly[1]
972	sbb	\$0, $acc6		# .Lpoly[2]
973	 mov	$acc7, $acc3
974	sbb	$t1, $acc7		# .Lpoly[3]
975	sbb	\$0, $t3
976
977	cmovc	$acc0, $acc4
978	cmovc	$acc1, $acc5
979	mov	$acc4, 8*0($r_ptr)
980	cmovc	$acc2, $acc6
981	mov	$acc5, 8*1($r_ptr)
982	cmovc	$acc3, $acc7
983	mov	$acc6, 8*2($r_ptr)
984	mov	$acc7, 8*3($r_ptr)
985
986	ret
987.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
988___
989}
990}
991{
992my ($r_ptr,$in_ptr)=("%rdi","%rsi");
993my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
994my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
995
996$code.=<<___;
997################################################################################
998# void ecp_nistz256_from_mont(
999#   uint64_t res[4],
1000#   uint64_t in[4]);
1001# This one performs Montgomery multiplication by 1, so we only need the reduction
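# i.e. it computes a * 2^-256 mod p, undoing the Montgomery encoding: with
# b = 1 the multiply-by-b[i] stages of mul_mont vanish and only the four
# word-by-word reduction iterations plus the final conditional subtraction
# remain.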
1002
1003.globl	ecp_nistz256_from_mont
1004.type	ecp_nistz256_from_mont,\@function,2
1005.align	32
1006ecp_nistz256_from_mont:
1007	push	%r12
1008	push	%r13
1009
1010	mov	8*0($in_ptr), %rax
1011	mov	.Lpoly+8*3(%rip), $t2
1012	mov	8*1($in_ptr), $acc1
1013	mov	8*2($in_ptr), $acc2
1014	mov	8*3($in_ptr), $acc3
1015	mov	%rax, $acc0
1016	mov	.Lpoly+8*1(%rip), $t1
1017
1018	#########################################
1019	# First iteration
1020	mov	%rax, $t0
1021	shl	\$32, $acc0
1022	mulq	$t2
1023	shr	\$32, $t0
1024	add	$acc0, $acc1
1025	adc	$t0, $acc2
1026	adc	%rax, $acc3
1027	 mov	$acc1, %rax
1028	adc	\$0, %rdx
1029
1030	#########################################
1031	# Second iteration
1032	mov	$acc1, $t0
1033	shl	\$32, $acc1
1034	mov	%rdx, $acc0
1035	mulq	$t2
1036	shr	\$32, $t0
1037	add	$acc1, $acc2
1038	adc	$t0, $acc3
1039	adc	%rax, $acc0
1040	 mov	$acc2, %rax
1041	adc	\$0, %rdx
1042
1043	##########################################
1044	# Third iteration
1045	mov	$acc2, $t0
1046	shl	\$32, $acc2
1047	mov	%rdx, $acc1
1048	mulq	$t2
1049	shr	\$32, $t0
1050	add	$acc2, $acc3
1051	adc	$t0, $acc0
1052	adc	%rax, $acc1
1053	 mov	$acc3, %rax
1054	adc	\$0, %rdx
1055
1056	###########################################
1057	# Last iteration
1058	mov	$acc3, $t0
1059	shl	\$32, $acc3
1060	mov	%rdx, $acc2
1061	mulq	$t2
1062	shr	\$32, $t0
1063	add	$acc3, $acc0
1064	adc	$t0, $acc1
1065	 mov	$acc0, $t0
1066	adc	%rax, $acc2
1067	 mov	$acc1, $in_ptr
1068	adc	\$0, %rdx
1069
1070	sub	\$-1, $acc0
1071	 mov	$acc2, %rax
1072	sbb	$t1, $acc1
1073	sbb	\$0, $acc2
1074	 mov	%rdx, $acc3
1075	sbb	$t2, %rdx
1076	sbb	$t2, $t2
1077
1078	cmovnz	$t0, $acc0
1079	cmovnz	$in_ptr, $acc1
1080	mov	$acc0, 8*0($r_ptr)
1081	cmovnz	%rax, $acc2
1082	mov	$acc1, 8*1($r_ptr)
1083	cmovz	%rdx, $acc3
1084	mov	$acc2, 8*2($r_ptr)
1085	mov	$acc3, 8*3($r_ptr)
1086
1087	pop	%r13
1088	pop	%r12
1089	ret
1090.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
1091___
1092}
1093{
1094my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1095my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
1096my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
my ($M1,$T1a,$T1b,$TMP1,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
1098
1099$code.=<<___;
1100################################################################################
1101# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
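# Constant-time gather: all 16 table entries (96 bytes each, one Jacobian
# point) are scanned; pcmpeqd builds an all-ones mask where the running
# counter equals the index argument (counting from 1), pand/por accumulate
# the selected entry, so the memory access pattern is independent of the
# secret index.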
1102.globl	ecp_nistz256_select_w5
1103.type	ecp_nistz256_select_w5,\@abi-omnipotent
1104.align	32
1105ecp_nistz256_select_w5:
1106___
1107$code.=<<___	if ($avx>1);
1108	mov	OPENSSL_ia32cap_P+8(%rip), %eax
1109	test	\$`1<<5`, %eax
1110	jnz	.Lavx2_select_w5
1111___
1112$code.=<<___	if ($win64);
1113	lea	-0x88(%rsp), %rax
1114.LSEH_begin_ecp_nistz256_select_w5:
1115	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1116	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
1117	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
1118	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
1119	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
1120	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
1121	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
1122	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
1123	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
1124	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
1125	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
1126___
1127$code.=<<___;
1128	movdqa	.LOne(%rip), $ONE
1129	movd	$index, $INDEX
1130
1131	pxor	$Ra, $Ra
1132	pxor	$Rb, $Rb
1133	pxor	$Rc, $Rc
1134	pxor	$Rd, $Rd
1135	pxor	$Re, $Re
1136	pxor	$Rf, $Rf
1137
1138	movdqa	$ONE, $M0
1139	pshufd	\$0, $INDEX, $INDEX
1140
1141	mov	\$16, %rax
1142.Lselect_loop_sse_w5:
1143
1144	movdqa	$M0, $TMP0
1145	paddd	$ONE, $M0
1146	pcmpeqd $INDEX, $TMP0
1147
1148	movdqa	16*0($in_t), $T0a
1149	movdqa	16*1($in_t), $T0b
1150	movdqa	16*2($in_t), $T0c
1151	movdqa	16*3($in_t), $T0d
1152	movdqa	16*4($in_t), $T0e
1153	movdqa	16*5($in_t), $T0f
1154	lea 16*6($in_t), $in_t
1155
1156	pand	$TMP0, $T0a
1157	pand	$TMP0, $T0b
1158	por	$T0a, $Ra
1159	pand	$TMP0, $T0c
1160	por	$T0b, $Rb
1161	pand	$TMP0, $T0d
1162	por	$T0c, $Rc
1163	pand	$TMP0, $T0e
1164	por	$T0d, $Rd
1165	pand	$TMP0, $T0f
1166	por	$T0e, $Re
1167	por	$T0f, $Rf
1168
1169	dec	%rax
1170	jnz	.Lselect_loop_sse_w5
1171
1172	movdqu	$Ra, 16*0($val)
1173	movdqu	$Rb, 16*1($val)
1174	movdqu	$Rc, 16*2($val)
1175	movdqu	$Rd, 16*3($val)
1176	movdqu	$Re, 16*4($val)
1177	movdqu	$Rf, 16*5($val)
1178___
1179$code.=<<___	if ($win64);
1180	movaps	(%rsp), %xmm6
1181	movaps	0x10(%rsp), %xmm7
1182	movaps	0x20(%rsp), %xmm8
1183	movaps	0x30(%rsp), %xmm9
1184	movaps	0x40(%rsp), %xmm10
1185	movaps	0x50(%rsp), %xmm11
1186	movaps	0x60(%rsp), %xmm12
1187	movaps	0x70(%rsp), %xmm13
1188	movaps	0x80(%rsp), %xmm14
1189	movaps	0x90(%rsp), %xmm15
1190	lea	0xa8(%rsp), %rsp
1191.LSEH_end_ecp_nistz256_select_w5:
1192___
1193$code.=<<___;
1194	ret
1195.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
1196
1197################################################################################
1198# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
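# Same constant-time pattern as select_w5, but over 64 affine entries of
# 64 bytes each (x and y only).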
1199.globl	ecp_nistz256_select_w7
1200.type	ecp_nistz256_select_w7,\@abi-omnipotent
1201.align	32
1202ecp_nistz256_select_w7:
1203___
1204$code.=<<___	if ($avx>1);
1205	mov	OPENSSL_ia32cap_P+8(%rip), %eax
1206	test	\$`1<<5`, %eax
1207	jnz	.Lavx2_select_w7
1208___
1209$code.=<<___	if ($win64);
1210	lea	-0x88(%rsp), %rax
1211.LSEH_begin_ecp_nistz256_select_w7:
1212	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1213	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
1214	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
1215	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
1216	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
1217	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
1218	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
1219	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
1220	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
1221	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
1222	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
1223___
1224$code.=<<___;
1225	movdqa	.LOne(%rip), $M0
1226	movd	$index, $INDEX
1227
1228	pxor	$Ra, $Ra
1229	pxor	$Rb, $Rb
1230	pxor	$Rc, $Rc
1231	pxor	$Rd, $Rd
1232
1233	movdqa	$M0, $ONE
1234	pshufd	\$0, $INDEX, $INDEX
1235	mov	\$64, %rax
1236
1237.Lselect_loop_sse_w7:
1238	movdqa	$M0, $TMP0
1239	paddd	$ONE, $M0
1240	movdqa	16*0($in_t), $T0a
1241	movdqa	16*1($in_t), $T0b
1242	pcmpeqd	$INDEX, $TMP0
1243	movdqa	16*2($in_t), $T0c
1244	movdqa	16*3($in_t), $T0d
1245	lea	16*4($in_t), $in_t
1246
1247	pand	$TMP0, $T0a
1248	pand	$TMP0, $T0b
1249	por	$T0a, $Ra
1250	pand	$TMP0, $T0c
1251	por	$T0b, $Rb
1252	pand	$TMP0, $T0d
1253	por	$T0c, $Rc
1254	prefetcht0	255($in_t)
1255	por	$T0d, $Rd
1256
1257	dec	%rax
1258	jnz	.Lselect_loop_sse_w7
1259
1260	movdqu	$Ra, 16*0($val)
1261	movdqu	$Rb, 16*1($val)
1262	movdqu	$Rc, 16*2($val)
1263	movdqu	$Rd, 16*3($val)
1264___
1265$code.=<<___	if ($win64);
1266	movaps	(%rsp), %xmm6
1267	movaps	0x10(%rsp), %xmm7
1268	movaps	0x20(%rsp), %xmm8
1269	movaps	0x30(%rsp), %xmm9
1270	movaps	0x40(%rsp), %xmm10
1271	movaps	0x50(%rsp), %xmm11
1272	movaps	0x60(%rsp), %xmm12
1273	movaps	0x70(%rsp), %xmm13
1274	movaps	0x80(%rsp), %xmm14
1275	movaps	0x90(%rsp), %xmm15
1276	lea	0xa8(%rsp), %rsp
1277.LSEH_end_ecp_nistz256_select_w7:
1278___
1279$code.=<<___;
1280	ret
1281.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
1282___
1283}
1284if ($avx>1) {
1285my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1286my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
1287my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
1288my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
1289
1290$code.=<<___;
1291################################################################################
1292# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
1293.type	ecp_nistz256_avx2_select_w5,\@abi-omnipotent
1294.align	32
1295ecp_nistz256_avx2_select_w5:
1296.Lavx2_select_w5:
1297	vzeroupper
1298___
1299$code.=<<___	if ($win64);
1300	lea	-0x88(%rsp), %rax
1301.LSEH_begin_ecp_nistz256_avx2_select_w5:
1302	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1303	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
1304	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
1306	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
1307	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
1308	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
1309	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
1310	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
1311	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
1312	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
1313___
1314$code.=<<___;
1315	vmovdqa	.LTwo(%rip), $TWO
1316
1317	vpxor	$Ra, $Ra, $Ra
1318	vpxor	$Rb, $Rb, $Rb
1319	vpxor	$Rc, $Rc, $Rc
1320
1321	vmovdqa .LOne(%rip), $M0
1322	vmovdqa .LTwo(%rip), $M1
1323
1324	vmovd	$index, %xmm1
1325	vpermd	$INDEX, $Ra, $INDEX
1326
1327	mov	\$8, %rax
1328.Lselect_loop_avx2_w5:
1329
1330	vmovdqa	32*0($in_t), $T0a
1331	vmovdqa	32*1($in_t), $T0b
1332	vmovdqa	32*2($in_t), $T0c
1333
1334	vmovdqa	32*3($in_t), $T1a
1335	vmovdqa	32*4($in_t), $T1b
1336	vmovdqa	32*5($in_t), $T1c
1337
1338	vpcmpeqd	$INDEX, $M0, $TMP0
1339	vpcmpeqd	$INDEX, $M1, $TMP1
1340
1341	vpaddd	$TWO, $M0, $M0
1342	vpaddd	$TWO, $M1, $M1
1343	lea	32*6($in_t), $in_t
1344
1345	vpand	$TMP0, $T0a, $T0a
1346	vpand	$TMP0, $T0b, $T0b
1347	vpand	$TMP0, $T0c, $T0c
1348	vpand	$TMP1, $T1a, $T1a
1349	vpand	$TMP1, $T1b, $T1b
1350	vpand	$TMP1, $T1c, $T1c
1351
1352	vpxor	$T0a, $Ra, $Ra
1353	vpxor	$T0b, $Rb, $Rb
1354	vpxor	$T0c, $Rc, $Rc
1355	vpxor	$T1a, $Ra, $Ra
1356	vpxor	$T1b, $Rb, $Rb
1357	vpxor	$T1c, $Rc, $Rc
1358
1359	dec %rax
1360	jnz .Lselect_loop_avx2_w5
1361
1362	vmovdqu $Ra, 32*0($val)
1363	vmovdqu $Rb, 32*1($val)
1364	vmovdqu $Rc, 32*2($val)
1365	vzeroupper
1366___
1367$code.=<<___	if ($win64);
1368	movaps	(%rsp), %xmm6
1369	movaps	0x10(%rsp), %xmm7
1370	movaps	0x20(%rsp), %xmm8
1371	movaps	0x30(%rsp), %xmm9
1372	movaps	0x40(%rsp), %xmm10
1373	movaps	0x50(%rsp), %xmm11
1374	movaps	0x60(%rsp), %xmm12
1375	movaps	0x70(%rsp), %xmm13
1376	movaps	0x80(%rsp), %xmm14
1377	movaps	0x90(%rsp), %xmm15
1378	lea	0xa8(%rsp), %rsp
1379.LSEH_end_ecp_nistz256_avx2_select_w5:
1380___
1381$code.=<<___;
1382	ret
1383.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
1384___
1385}
1386if ($avx>1) {
1387my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1388my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
1389my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
1390my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
1391my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
1392
1393$code.=<<___;
1394
1395################################################################################
1396# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
1397.globl	ecp_nistz256_avx2_select_w7
1398.type	ecp_nistz256_avx2_select_w7,\@abi-omnipotent
1399.align	32
1400ecp_nistz256_avx2_select_w7:
1401.Lavx2_select_w7:
1402	vzeroupper
1403___
1404$code.=<<___	if ($win64);
1405	lea	-0x88(%rsp), %rax
1406.LSEH_begin_ecp_nistz256_avx2_select_w7:
1407	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1408	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
1409	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
1411	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
1412	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
1413	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
1414	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
1415	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
1416	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
1417	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
1418___
1419$code.=<<___;
1420	vmovdqa	.LThree(%rip), $THREE
1421
1422	vpxor	$Ra, $Ra, $Ra
1423	vpxor	$Rb, $Rb, $Rb
1424
1425	vmovdqa .LOne(%rip), $M0
1426	vmovdqa .LTwo(%rip), $M1
1427	vmovdqa .LThree(%rip), $M2
1428
1429	vmovd	$index, %xmm1
1430	vpermd	$INDEX, $Ra, $INDEX
1431	# Skip index = 0, because it is implicitly the point at infinity
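	# (the loop below covers three table entries per iteration, 21*3 = 63
	#  of them, and the one remaining 64th entry is picked up after the
	#  loop)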
1432
1433	mov	\$21, %rax
1434.Lselect_loop_avx2_w7:
1435
1436	vmovdqa	32*0($in_t), $T0a
1437	vmovdqa	32*1($in_t), $T0b
1438
1439	vmovdqa	32*2($in_t), $T1a
1440	vmovdqa	32*3($in_t), $T1b
1441
1442	vmovdqa	32*4($in_t), $T2a
1443	vmovdqa	32*5($in_t), $T2b
1444
1445	vpcmpeqd	$INDEX, $M0, $TMP0
1446	vpcmpeqd	$INDEX, $M1, $TMP1
1447	vpcmpeqd	$INDEX, $M2, $TMP2
1448
1449	vpaddd	$THREE, $M0, $M0
1450	vpaddd	$THREE, $M1, $M1
1451	vpaddd	$THREE, $M2, $M2
1452	lea	32*6($in_t), $in_t
1453
1454	vpand	$TMP0, $T0a, $T0a
1455	vpand	$TMP0, $T0b, $T0b
1456	vpand	$TMP1, $T1a, $T1a
1457	vpand	$TMP1, $T1b, $T1b
1458	vpand	$TMP2, $T2a, $T2a
1459	vpand	$TMP2, $T2b, $T2b
1460
1461	vpxor	$T0a, $Ra, $Ra
1462	vpxor	$T0b, $Rb, $Rb
1463	vpxor	$T1a, $Ra, $Ra
1464	vpxor	$T1b, $Rb, $Rb
1465	vpxor	$T2a, $Ra, $Ra
1466	vpxor	$T2b, $Rb, $Rb
1467
1468	dec %rax
1469	jnz .Lselect_loop_avx2_w7
1470
1471
1472	vmovdqa	32*0($in_t), $T0a
1473	vmovdqa	32*1($in_t), $T0b
1474
1475	vpcmpeqd	$INDEX, $M0, $TMP0
1476
1477	vpand	$TMP0, $T0a, $T0a
1478	vpand	$TMP0, $T0b, $T0b
1479
1480	vpxor	$T0a, $Ra, $Ra
1481	vpxor	$T0b, $Rb, $Rb
1482
1483	vmovdqu $Ra, 32*0($val)
1484	vmovdqu $Rb, 32*1($val)
1485	vzeroupper
1486___
1487$code.=<<___	if ($win64);
1488	movaps	(%rsp), %xmm6
1489	movaps	0x10(%rsp), %xmm7
1490	movaps	0x20(%rsp), %xmm8
1491	movaps	0x30(%rsp), %xmm9
1492	movaps	0x40(%rsp), %xmm10
1493	movaps	0x50(%rsp), %xmm11
1494	movaps	0x60(%rsp), %xmm12
1495	movaps	0x70(%rsp), %xmm13
1496	movaps	0x80(%rsp), %xmm14
1497	movaps	0x90(%rsp), %xmm15
1498	lea	0xa8(%rsp), %rsp
1499.LSEH_end_ecp_nistz256_avx2_select_w7:
1500___
1501$code.=<<___;
1502	ret
1503.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1504___
1505} else {
1506$code.=<<___;
1507.globl	ecp_nistz256_avx2_select_w7
1508.type	ecp_nistz256_avx2_select_w7,\@function,3
1509.align	32
1510ecp_nistz256_avx2_select_w7:
1511	.byte	0x0f,0x0b	# ud2
1512	ret
1513.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1514___
1515}
1516{{{
1517########################################################################
1518# This block implements higher level point_double, point_add and
# point_add_affine. The key to performance in this case is to allow the
# out-of-order execution logic to overlap computations from the next step
# with tail processing from the current step. By using a tailored calling
# sequence we minimize inter-step overhead and give the processor a better
# shot at overlapping operations...
#
# You will notice that the input data is copied to the stack. The trouble
# is that there are no registers to spare for holding the original
# pointers, and reloading those pointers would create undesired
# dependencies on the effective-address calculation paths. In other words,
# it is all done to favour the out-of-order execution logic.
1530#						<appro@openssl.org>
1531
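# Roughly, the convention used below: each __ecp_nistz256_* helper expects
# its operands pre-loaded into registers (see load_for_mul/load_for_sqr)
# and, besides storing its result at $r_ptr, leaves that result in
# registers as well, so the next step can pick it up without reloading
# from memory.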
1532my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
1533my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
1534my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
1535my ($poly1,$poly3)=($acc6,$acc7);
1536
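# load_for_mul/load_for_sqr emit the register-loading preamble for a
# following __ecp_nistz256_mul_mont$x/__ecp_nistz256_sqr_mont$x call;
# $bias is -128 on the mulx path because those routines address their
# input as 8*N+128($a_ptr) (cf. the "lea -128($a_ptr)" adjustments above).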
1537sub load_for_mul () {
1538my ($a,$b,$src0) = @_;
1539my $bias = $src0 eq "%rax" ? 0 : -128;
1540
1541"	mov	$b, $src0
1542	lea	$b, $b_ptr
1543	mov	8*0+$a, $acc1
1544	mov	8*1+$a, $acc2
1545	lea	$bias+$a, $a_ptr
1546	mov	8*2+$a, $acc3
1547	mov	8*3+$a, $acc4"
1548}
1549
1550sub load_for_sqr () {
1551my ($a,$src0) = @_;
1552my $bias = $src0 eq "%rax" ? 0 : -128;
1553
1554"	mov	8*0+$a, $src0
1555	mov	8*1+$a, $acc6
1556	lea	$bias+$a, $a_ptr
1557	mov	8*2+$a, $acc7
1558	mov	8*3+$a, $acc0"
1559}
1560
1561									{
1562########################################################################
1563# operate in 4-5-0-1 "name space" that matches multiplication output
1564#
1565my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1566
1567$code.=<<___;
1568.type	__ecp_nistz256_add_toq,\@abi-omnipotent
1569.align	32
1570__ecp_nistz256_add_toq:
1571	add	8*0($b_ptr), $a0
1572	adc	8*1($b_ptr), $a1
1573	 mov	$a0, $t0
1574	adc	8*2($b_ptr), $a2
1575	adc	8*3($b_ptr), $a3
1576	 mov	$a1, $t1
1577	sbb	$t4, $t4
1578
1579	sub	\$-1, $a0
1580	 mov	$a2, $t2
1581	sbb	$poly1, $a1
1582	sbb	\$0, $a2
1583	 mov	$a3, $t3
1584	sbb	$poly3, $a3
1585	test	$t4, $t4
1586
1587	cmovz	$t0, $a0
1588	cmovz	$t1, $a1
1589	mov	$a0, 8*0($r_ptr)
1590	cmovz	$t2, $a2
1591	mov	$a1, 8*1($r_ptr)
1592	cmovz	$t3, $a3
1593	mov	$a2, 8*2($r_ptr)
1594	mov	$a3, 8*3($r_ptr)
1595
1596	ret
1597.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
1598
1599.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
1600.align	32
1601__ecp_nistz256_sub_fromq:
1602	sub	8*0($b_ptr), $a0
1603	sbb	8*1($b_ptr), $a1
1604	 mov	$a0, $t0
1605	sbb	8*2($b_ptr), $a2
1606	sbb	8*3($b_ptr), $a3
1607	 mov	$a1, $t1
1608	sbb	$t4, $t4
1609
1610	add	\$-1, $a0
1611	 mov	$a2, $t2
1612	adc	$poly1, $a1
1613	adc	\$0, $a2
1614	 mov	$a3, $t3
1615	adc	$poly3, $a3
1616	test	$t4, $t4
1617
1618	cmovz	$t0, $a0
1619	cmovz	$t1, $a1
1620	mov	$a0, 8*0($r_ptr)
1621	cmovz	$t2, $a2
1622	mov	$a1, 8*1($r_ptr)
1623	cmovz	$t3, $a3
1624	mov	$a2, 8*2($r_ptr)
1625	mov	$a3, 8*3($r_ptr)
1626
1627	ret
1628.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
1629
1630.type	__ecp_nistz256_subq,\@abi-omnipotent
1631.align	32
1632__ecp_nistz256_subq:
1633	sub	$a0, $t0
1634	sbb	$a1, $t1
1635	 mov	$t0, $a0
1636	sbb	$a2, $t2
1637	sbb	$a3, $t3
1638	 mov	$t1, $a1
1639	sbb	$t4, $t4
1640
1641	add	\$-1, $t0
1642	 mov	$t2, $a2
1643	adc	$poly1, $t1
1644	adc	\$0, $t2
1645	 mov	$t3, $a3
1646	adc	$poly3, $t3
1647	test	$t4, $t4
1648
1649	cmovnz	$t0, $a0
1650	cmovnz	$t1, $a1
1651	cmovnz	$t2, $a2
1652	cmovnz	$t3, $a3
1653
1654	ret
1655.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
1656
1657.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
1658.align	32
1659__ecp_nistz256_mul_by_2q:
1660	add	$a0, $a0		# a0:a3+a0:a3
1661	adc	$a1, $a1
1662	 mov	$a0, $t0
1663	adc	$a2, $a2
1664	adc	$a3, $a3
1665	 mov	$a1, $t1
1666	sbb	$t4, $t4
1667
1668	sub	\$-1, $a0
1669	 mov	$a2, $t2
1670	sbb	$poly1, $a1
1671	sbb	\$0, $a2
1672	 mov	$a3, $t3
1673	sbb	$poly3, $a3
1674	test	$t4, $t4
1675
1676	cmovz	$t0, $a0
1677	cmovz	$t1, $a1
1678	mov	$a0, 8*0($r_ptr)
1679	cmovz	$t2, $a2
1680	mov	$a1, 8*1($r_ptr)
1681	cmovz	$t3, $a3
1682	mov	$a2, 8*2($r_ptr)
1683	mov	$a3, 8*3($r_ptr)
1684
1685	ret
1686.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
1687___
1688									}
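# gen_double, gen_add and gen_add_affine below each generate a flavour of
# the corresponding point routine: the generic mulq-based one (empty
# suffix) and, when $addx is enabled, a MULX/ADCX/ADOX one (suffix "x");
# $x selects the flavour, $sfx the symbol suffix and $bias the -128
# pointer adjustment used by the mulx subroutines.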
1689sub gen_double () {
1690    my $x = shift;
1691    my ($src0,$sfx,$bias);
1692    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1693
1694    if ($x ne "x") {
1695	$src0 = "%rax";
1696	$sfx  = "";
1697	$bias = 0;
1698
1699$code.=<<___;
1700.globl	ecp_nistz256_point_double
1701.type	ecp_nistz256_point_double,\@function,2
1702.align	32
1703ecp_nistz256_point_double:
1704___
1705$code.=<<___	if ($addx);
1706	mov	\$0x80100, %ecx
1707	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1708	cmp	\$0x80100, %ecx
1709	je	.Lpoint_doublex
1710___
1711    } else {
1712	$src0 = "%rdx";
1713	$sfx  = "x";
1714	$bias = 128;
1715
1716$code.=<<___;
1717.type	ecp_nistz256_point_doublex,\@function,2
1718.align	32
1719ecp_nistz256_point_doublex:
1720.Lpoint_doublex:
1721___
1722    }
1723$code.=<<___;
1724	push	%rbp
1725	push	%rbx
1726	push	%r12
1727	push	%r13
1728	push	%r14
1729	push	%r15
1730	sub	\$32*5+8, %rsp
1731
1732	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
1733	mov	$a_ptr, $b_ptr			# backup copy
1734	movdqu	0x10($a_ptr), %xmm1
1735	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
1736	 mov	0x20+8*1($a_ptr), $acc5
1737	 mov	0x20+8*2($a_ptr), $acc0
1738	 mov	0x20+8*3($a_ptr), $acc1
1739	 mov	.Lpoly+8*1(%rip), $poly1
1740	 mov	.Lpoly+8*3(%rip), $poly3
1741	movdqa	%xmm0, $in_x(%rsp)
1742	movdqa	%xmm1, $in_x+0x10(%rsp)
1743	lea	0x20($r_ptr), $acc2
1744	lea	0x40($r_ptr), $acc3
1745	movq	$r_ptr, %xmm0
1746	movq	$acc2, %xmm1
1747	movq	$acc3, %xmm2
1748
1749	lea	$S(%rsp), $r_ptr
1750	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
1751
1752	mov	0x40+8*0($a_ptr), $src0
1753	mov	0x40+8*1($a_ptr), $acc6
1754	mov	0x40+8*2($a_ptr), $acc7
1755	mov	0x40+8*3($a_ptr), $acc0
1756	lea	0x40-$bias($a_ptr), $a_ptr
1757	lea	$Zsqr(%rsp), $r_ptr
1758	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
1759
1760	`&load_for_sqr("$S(%rsp)", "$src0")`
1761	lea	$S(%rsp), $r_ptr
1762	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
1763
1764	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
1765	mov	0x40+8*0($b_ptr), $acc1
1766	mov	0x40+8*1($b_ptr), $acc2
1767	mov	0x40+8*2($b_ptr), $acc3
1768	mov	0x40+8*3($b_ptr), $acc4
1769	lea	0x40-$bias($b_ptr), $a_ptr
1770	lea	0x20($b_ptr), $b_ptr
1771	movq	%xmm2, $r_ptr
1772	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
1773	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
1774
1775	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
1776	mov	$in_x+8*1(%rsp), $acc5
1777	lea	$Zsqr(%rsp), $b_ptr
1778	mov	$in_x+8*2(%rsp), $acc0
1779	mov	$in_x+8*3(%rsp), $acc1
1780	lea	$M(%rsp), $r_ptr
1781	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
1782
1783	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
1784	mov	$in_x+8*1(%rsp), $acc5
1785	lea	$Zsqr(%rsp), $b_ptr
1786	mov	$in_x+8*2(%rsp), $acc0
1787	mov	$in_x+8*3(%rsp), $acc1
1788	lea	$Zsqr(%rsp), $r_ptr
1789	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
1790
1791	`&load_for_sqr("$S(%rsp)", "$src0")`
1792	movq	%xmm1, $r_ptr
1793	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
1794___
1795{
1796######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
1797# operate in 4-5-6-7 "name space" that matches squaring output
1798#
1799my ($poly1,$poly3)=($a_ptr,$t1);
1800my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
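# The inline sequence below halves res_y modulo p: p is added speculatively,
# the cmovz's undo the addition when the low bit of the original value is
# clear (i.e. the value is already even), and the five limbs, including the
# carry collected in t4, are then shifted right by one bit.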
1801
1802$code.=<<___;
1803	xor	$t4, $t4
1804	mov	$a0, $t0
1805	add	\$-1, $a0
1806	mov	$a1, $t1
1807	adc	$poly1, $a1
1808	mov	$a2, $t2
1809	adc	\$0, $a2
1810	mov	$a3, $t3
1811	adc	$poly3, $a3
1812	adc	\$0, $t4
1813	xor	$a_ptr, $a_ptr		# borrow $a_ptr
1814	test	\$1, $t0
1815
1816	cmovz	$t0, $a0
1817	cmovz	$t1, $a1
1818	cmovz	$t2, $a2
1819	cmovz	$t3, $a3
1820	cmovz	$a_ptr, $t4
1821
1822	mov	$a1, $t0		# a0:a3>>1
1823	shr	\$1, $a0
1824	shl	\$63, $t0
1825	mov	$a2, $t1
1826	shr	\$1, $a1
1827	or	$t0, $a0
1828	shl	\$63, $t1
1829	mov	$a3, $t2
1830	shr	\$1, $a2
1831	or	$t1, $a1
1832	shl	\$63, $t2
1833	mov	$a0, 8*0($r_ptr)
1834	shr	\$1, $a3
1835	mov	$a1, 8*1($r_ptr)
1836	shl	\$63, $t4
1837	or	$t2, $a2
1838	or	$t4, $a3
1839	mov	$a2, 8*2($r_ptr)
1840	mov	$a3, 8*3($r_ptr)
1841___
1842}
1843$code.=<<___;
1844	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
1845	lea	$M(%rsp), $r_ptr
1846	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
1847
1848	lea	$tmp0(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, M);
1850
1851	lea	$M(%rsp), $b_ptr
1852	lea	$M(%rsp), $r_ptr
1853	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
1854
1855	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
1856	lea	$S(%rsp), $r_ptr
1857	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
1858
1859	lea	$tmp0(%rsp), $r_ptr
1860	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
1861
1862	`&load_for_sqr("$M(%rsp)", "$src0")`
1863	movq	%xmm0, $r_ptr
1864	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
1865
1866	lea	$tmp0(%rsp), $b_ptr
1867	mov	$acc6, $acc0			# harmonize sqr output and sub input
1868	mov	$acc7, $acc1
1869	mov	$a_ptr, $poly1
1870	mov	$t1, $poly3
1871	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
1872
1873	mov	$S+8*0(%rsp), $t0
1874	mov	$S+8*1(%rsp), $t1
1875	mov	$S+8*2(%rsp), $t2
1876	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
1877	lea	$S(%rsp), $r_ptr
1878	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
1879
1880	mov	$M(%rsp), $src0
1881	lea	$M(%rsp), $b_ptr
1882	mov	$acc4, $acc6			# harmonize sub output and mul input
1883	xor	%ecx, %ecx
1884	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
1885	mov	$acc5, $acc2
1886	mov	$acc5, $S+8*1(%rsp)
1887	cmovz	$acc0, $acc3
1888	mov	$acc0, $S+8*2(%rsp)
1889	lea	$S-$bias(%rsp), $a_ptr
1890	cmovz	$acc1, $acc4
1891	mov	$acc1, $S+8*3(%rsp)
1892	mov	$acc6, $acc1
1893	lea	$S(%rsp), $r_ptr
1894	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
1895
1896	movq	%xmm1, $b_ptr
1897	movq	%xmm1, $r_ptr
1898	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
1899
1900	add	\$32*5+8, %rsp
1901	pop	%r15
1902	pop	%r14
1903	pop	%r13
1904	pop	%r12
1905	pop	%rbx
1906	pop	%rbp
1907	ret
1908.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
1909___
1910}
1911&gen_double("q");
1912
1913sub gen_add () {
1914    my $x = shift;
1915    my ($src0,$sfx,$bias);
1916    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
1917	$U1,$U2,$S1,$S2,
1918	$res_x,$res_y,$res_z,
1919	$in1_x,$in1_y,$in1_z,
1920	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
1921    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1922
1923    if ($x ne "x") {
1924	$src0 = "%rax";
1925	$sfx  = "";
1926	$bias = 0;
1927
1928$code.=<<___;
1929.globl	ecp_nistz256_point_add
1930.type	ecp_nistz256_point_add,\@function,3
1931.align	32
1932ecp_nistz256_point_add:
1933___
1934$code.=<<___	if ($addx);
1935	mov	\$0x80100, %ecx
1936	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1937	cmp	\$0x80100, %ecx
1938	je	.Lpoint_addx
1939___
1940    } else {
1941	$src0 = "%rdx";
1942	$sfx  = "x";
1943	$bias = 128;
1944
1945$code.=<<___;
1946.type	ecp_nistz256_point_addx,\@function,3
1947.align	32
1948ecp_nistz256_point_addx:
1949.Lpoint_addx:
1950___
1951    }
1952$code.=<<___;
1953	push	%rbp
1954	push	%rbx
1955	push	%r12
1956	push	%r13
1957	push	%r14
1958	push	%r15
1959	sub	\$32*18+8, %rsp
1960
1961	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
1962	movdqu	0x10($a_ptr), %xmm1
1963	movdqu	0x20($a_ptr), %xmm2
1964	movdqu	0x30($a_ptr), %xmm3
1965	movdqu	0x40($a_ptr), %xmm4
1966	movdqu	0x50($a_ptr), %xmm5
1967	mov	$a_ptr, $b_ptr			# reassign
1968	mov	$b_org, $a_ptr			# reassign
1969	movdqa	%xmm0, $in1_x(%rsp)
1970	movdqa	%xmm1, $in1_x+0x10(%rsp)
1971	por	%xmm0, %xmm1
1972	movdqa	%xmm2, $in1_y(%rsp)
1973	movdqa	%xmm3, $in1_y+0x10(%rsp)
1974	por	%xmm2, %xmm3
1975	movdqa	%xmm4, $in1_z(%rsp)
1976	movdqa	%xmm5, $in1_z+0x10(%rsp)
1977	por	%xmm1, %xmm3
1978
1979	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
1980	 pshufd	\$0xb1, %xmm3, %xmm5
1981	movdqu	0x10($a_ptr), %xmm1
1982	movdqu	0x20($a_ptr), %xmm2
1983	 por	%xmm3, %xmm5
1984	movdqu	0x30($a_ptr), %xmm3
1985	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
1986	 mov	0x40+8*1($a_ptr), $acc6
1987	 mov	0x40+8*2($a_ptr), $acc7
1988	 mov	0x40+8*3($a_ptr), $acc0
1989	movdqa	%xmm0, $in2_x(%rsp)
1990	 pshufd	\$0x1e, %xmm5, %xmm4
1991	movdqa	%xmm1, $in2_x+0x10(%rsp)
1992	por	%xmm0, %xmm1
1993	 movq	$r_ptr, %xmm0			# save $r_ptr
1994	movdqa	%xmm2, $in2_y(%rsp)
1995	movdqa	%xmm3, $in2_y+0x10(%rsp)
1996	por	%xmm2, %xmm3
1997	 por	%xmm4, %xmm5
1998	 pxor	%xmm4, %xmm4
1999	por	%xmm1, %xmm3
2000
2001	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
2002	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
2003	 mov	$acc6, $in2_z+8*1(%rsp)
2004	 mov	$acc7, $in2_z+8*2(%rsp)
2005	 mov	$acc0, $in2_z+8*3(%rsp)
2006	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
2007	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
2008
2009	pcmpeqd	%xmm4, %xmm5
2010	pshufd	\$0xb1, %xmm3, %xmm4
2011	por	%xmm3, %xmm4
2012	pshufd	\$0, %xmm5, %xmm5		# in1infty
2013	pshufd	\$0x1e, %xmm4, %xmm3
2014	por	%xmm3, %xmm4
2015	pxor	%xmm3, %xmm3
2016	pcmpeqd	%xmm3, %xmm4
2017	pshufd	\$0, %xmm4, %xmm4		# in2infty
2018	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
2019	 mov	0x40+8*1($b_ptr), $acc6
2020	 mov	0x40+8*2($b_ptr), $acc7
2021	 mov	0x40+8*3($b_ptr), $acc0
2022
2023	lea	0x40-$bias($b_ptr), $a_ptr
2024	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
2025	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
2026
2027	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
2028	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
2029	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
2030
2031	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
2032	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
2033	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
2034
2035	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
2036	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
2037	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
2038
2039	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
2040	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
2041	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
2042
2043	lea	$S1(%rsp), $b_ptr
2044	lea	$R(%rsp), $r_ptr		# R = S2 - S1
2045	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
2046
2047	or	$acc5, $acc4			# see if result is zero
2048	movdqa	%xmm4, %xmm2
2049	or	$acc0, $acc4
2050	or	$acc1, $acc4
2051	por	%xmm5, %xmm2			# in1infty || in2infty
2052	movq	$acc4, %xmm3
2053
2054	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
2055	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
2056	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
2057
2058	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
2059	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
2060	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
2061
2062	lea	$U1(%rsp), $b_ptr
2063	lea	$H(%rsp), $r_ptr		# H = U2 - U1
2064	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
2065
2066	or	$acc5, $acc4			# see if result is zero
2067	or	$acc0, $acc4
2068	or	$acc1, $acc4
2069
2070	.byte	0x3e				# predict taken
2071	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
2072	movq	%xmm2, $acc0
2073	movq	%xmm3, $acc1
2074	test	$acc0, $acc0
2075	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
2076	test	$acc1, $acc1
2077	jz	.Ladd_proceed$x			# is_equal(S1,S2)?
2078
2079	movq	%xmm0, $r_ptr			# restore $r_ptr
2080	pxor	%xmm0, %xmm0
2081	movdqu	%xmm0, 0x00($r_ptr)
2082	movdqu	%xmm0, 0x10($r_ptr)
2083	movdqu	%xmm0, 0x20($r_ptr)
2084	movdqu	%xmm0, 0x30($r_ptr)
2085	movdqu	%xmm0, 0x40($r_ptr)
2086	movdqu	%xmm0, 0x50($r_ptr)
2087	jmp	.Ladd_done$x
2088
2089.align	32
2090.Ladd_proceed$x:
2091	`&load_for_sqr("$R(%rsp)", "$src0")`
2092	lea	$Rsqr(%rsp), $r_ptr		# R^2
2093	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
2094
2095	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
2096	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
2097	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
2098
2099	`&load_for_sqr("$H(%rsp)", "$src0")`
2100	lea	$Hsqr(%rsp), $r_ptr		# H^2
2101	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
2102
2103	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
2104	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
2105	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
2106
2107	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
2108	lea	$Hcub(%rsp), $r_ptr		# H^3
2109	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
2110
2111	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
2112	lea	$U2(%rsp), $r_ptr		# U1*H^2
2113	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
2114___
2115{
2116#######################################################################
2117# operate in 4-5-0-1 "name space" that matches multiplication output
2118#
2119my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2120my ($poly1, $poly3)=($acc6,$acc7);
2121
2122$code.=<<___;
2123	#lea	$U2(%rsp), $a_ptr
2124	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
2125	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
2126
2127	add	$acc0, $acc0		# a0:a3+a0:a3
2128	lea	$Rsqr(%rsp), $a_ptr
2129	adc	$acc1, $acc1
2130	 mov	$acc0, $t0
2131	adc	$acc2, $acc2
2132	adc	$acc3, $acc3
2133	 mov	$acc1, $t1
2134	sbb	$t4, $t4
2135
2136	sub	\$-1, $acc0
2137	 mov	$acc2, $t2
2138	sbb	$poly1, $acc1
2139	sbb	\$0, $acc2
2140	 mov	$acc3, $t3
2141	sbb	$poly3, $acc3
2142	test	$t4, $t4
2143
2144	cmovz	$t0, $acc0
2145	mov	8*0($a_ptr), $t0
2146	cmovz	$t1, $acc1
2147	mov	8*1($a_ptr), $t1
2148	cmovz	$t2, $acc2
2149	mov	8*2($a_ptr), $t2
2150	cmovz	$t3, $acc3
2151	mov	8*3($a_ptr), $t3
2152
2153	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
2154
2155	lea	$Hcub(%rsp), $b_ptr
2156	lea	$res_x(%rsp), $r_ptr
2157	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
2158
2159	mov	$U2+8*0(%rsp), $t0
2160	mov	$U2+8*1(%rsp), $t1
2161	mov	$U2+8*2(%rsp), $t2
2162	mov	$U2+8*3(%rsp), $t3
2163	lea	$res_y(%rsp), $r_ptr
2164
2165	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
2166
2167	mov	$acc0, 8*0($r_ptr)		# save the result, as
2168	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
2169	mov	$acc2, 8*2($r_ptr)
2170	mov	$acc3, 8*3($r_ptr)
2171___
2172}
2173$code.=<<___;
2174	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
2175	lea	$S2(%rsp), $r_ptr
2176	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
2177
2178	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
2179	lea	$res_y(%rsp), $r_ptr
2180	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
2181
2182	lea	$S2(%rsp), $b_ptr
2183	lea	$res_y(%rsp), $r_ptr
2184	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
2185
2186	movq	%xmm0, $r_ptr		# restore $r_ptr
2187
	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_z(%rsp), %xmm2
	pand	$in2_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

.Ladd_done$x:
	add	\$32*18+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
___
}
&gen_add("q");
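
# For reference only (never called by this generator): a scalar Perl sketch of
# the branch-free selection that the pandn/pand/por blocks above implement.
# The mask is assumed to be all-ones or all-zeros, as produced by the
# pcmpeqd/pshufd sequences.
sub __copy_conditional_sketch {
    my ($res, $src, $mask) = @_;	# array refs holding four 64-bit limbs
    return [ map { ($res->[$_] & ~$mask) | ($src->[$_] & $mask) } (0 .. 3) ];
}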

sub gen_add_affine () {
    my $x = shift;
    my ($src0,$sfx,$bias);
    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
	$res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y)=map(32*$_,(0..14));
    my $Z1sqr = $S2;
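    # In the affine variant Z2 == 1, so U1 = X1, S1 = Y1 and Z3 = H*Z1.  The
    # frame therefore needs no in2_z slot (15 rather than 18 32-byte slots),
    # $Z1sqr can share the $S2 slot, and .LONE_mont stands in for res_z when
    # the Jacobian input turns out to be the point at infinity.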

    if ($x ne "x") {
	$src0 = "%rax";
	$sfx  = "";
	$bias = 0;

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,\@function,3
.align	32
ecp_nistz256_point_add_affine:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_add_affinex
___
    } else {
	$src0 = "%rdx";
	$sfx  = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_add_affinex,\@function,3
.align	32
ecp_nistz256_point_add_affinex:
.Lpoint_add_affinex:
___
    }
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*15+8, %rsp

	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
	mov	$b_org, $b_ptr		# reassign
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
	 mov	0x40+8*1($a_ptr), $acc6
	 mov	0x40+8*2($a_ptr), $acc7
	 mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	por	%xmm0, %xmm1
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	por	%xmm2, %xmm3
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	por	%xmm1, %xmm3

	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
	 pshufd	\$0xb1, %xmm3, %xmm5
	movdqu	0x10($b_ptr), %xmm1
	movdqu	0x20($b_ptr), %xmm2
	 por	%xmm3, %xmm5
	movdqu	0x30($b_ptr), %xmm3
	movdqa	%xmm0, $in2_x(%rsp)
	 pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	por	%xmm0, %xmm1
	 movq	$r_ptr, %xmm0		# save $r_ptr
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	por	%xmm2, %xmm3
	 por	%xmm4, %xmm5
	 pxor	%xmm4, %xmm4
	por	%xmm1, %xmm3

	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);

	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm3, %xmm4
	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
	 #lea	0x00($b_ptr), $b_ptr
	 mov	$acc4, $acc1			# harmonize sqr output and mul input
	por	%xmm3, %xmm4
	pshufd	\$0, %xmm5, %xmm5		# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	 mov	$acc5, $acc2
	por	%xmm3, %xmm4
	pxor	%xmm3, %xmm3
	 mov	$acc6, $acc3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4		# in2infty

	lea	$Z1sqr-$bias(%rsp), $a_ptr
	mov	$acc7, $acc4
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);

	lea	$in1_x(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);

	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1 (Z2 = 1)
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);

	lea	$in1_y(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
___
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);

	add	$acc0, $acc0		# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	adc	$acc1, $acc1
	 mov	$acc0, $t0
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	 mov	$acc1, $t1
	sbb	$t4, $t4

	sub	\$-1, $acc0
	 mov	$acc2, $t2
	sbb	$poly1, $acc1
	sbb	\$0, $acc2
	 mov	$acc3, $t3
	sbb	$poly3, $acc3
	test	$t4, $t4

	cmovz	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovz	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovz	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovz	$t3, $acc3
	mov	8*3($a_ptr), $t3

	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);

	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$H(%rsp), $r_ptr

	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
	mov	$acc2, 8*2($r_ptr)		# store it to memory
	mov	$acc3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);

	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
	lea	$H(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);

	movq	%xmm0, $r_ptr		# restore $r_ptr

	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	.LONE_mont(%rip), %xmm2
	pand	.LONE_mont+0x10(%rip), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

	add	\$32*15+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
___
}
&gen_add_affine("q");

########################################################################
# AD*X magic
#
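# The block below is generated only when $addx is set: it emits the
# x-suffixed add/sub/mul_by_2 helpers that accompany the MULX/ADCX/ADOX
# based Montgomery code, then regenerates the double/add/add_affine entry
# points with the "x" suffix.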
if ($addx) {								{
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

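# These helpers keep the carry chain entirely in the flags: xor clears CF so
# the leading adc/sbb acts as a plain add/sub, the carry or borrow out of the
# chain is captured in $t4, and bt copies its low bit back into CF so the
# conditional moves can select the correctly reduced result.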
$code.=<<___;
.type	__ecp_nistz256_add_tox,\@abi-omnipotent
.align	32
__ecp_nistz256_add_tox:
	xor	$t4, $t4
	adc	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	 mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	 mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	 mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	 mov	$a3, $t3
	sbb	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox

.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromx:
	xor	$t4, $t4
	sbb	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	 mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	 mov	$a1, $t1
	sbb	\$0, $t4

	xor	$t3, $t3
	adc	\$-1, $a0
	 mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	 mov	$a3, $t3
	adc	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx

.type	__ecp_nistz256_subx,\@abi-omnipotent
.align	32
__ecp_nistz256_subx:
	xor	$t4, $t4
	sbb	$a0, $t0
	sbb	$a1, $t1
	 mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	 mov	$t1, $a1
	sbb	\$0, $t4

	xor	$a3, $a3
	adc	\$-1, $t0
	 mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	 mov	$t3, $a3
	adc	$poly3, $t3

	bt	\$0, $t4
	cmovc	$t0, $a0
	cmovc	$t1, $a1
	cmovc	$t2, $a2
	cmovc	$t3, $a3

	ret
.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx

.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2x:
	xor	$t4, $t4
	adc	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	 mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	 mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	 mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	 mov	$a3, $t3
	sbb	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
___
									}
&gen_double("x");
&gen_add("x");
&gen_add_affine("x");
}
}}}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;