#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The
# multiplication instruction issue rate is limited on the processor in
# question, meaning that a dedicated squaring procedure is a must.
# In fact, all contemporary AArch64 processors seem to have a limited
# multiplication issue rate, i.e. they can't issue a multiplication
# every cycle, which explains the moderate improvement coefficients in
# comparison to compiler-generated code. Recall that the compiler is
# instructed to use umulh and therefore uses the same number of
# multiplication instructions to do the job. Assembly's edge is in
# minimizing the number of "collateral" instructions and, of course,
# in instruction scheduling.
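#
# As a point of reference, the mul/umulh pairing referred to above
# corresponds to the following C idiom (an illustrative sketch, not
# part of this module; it assumes 64-bit limbs and a compiler with
# unsigned __int128 support):
#
#	unsigned __int128 prod = (unsigned __int128)a * b;
#	uint64_t lo = (uint64_t)prod;		/* one mul   */
#	uint64_t hi = (uint64_t)(prod >> 64);	/* one umulh */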
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest for the longest key lengths, while other processors
# exhibit e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is
# ~25% faster on Cortex-A57 and ~60-100% faster on others.

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);
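#
# For orientation, a multi-precision sketch of the word-by-word
# Montgomery multiplication implemented below (hedged pseudocode, with
# ap/np/tp read as num-limb integers; the real code interleaves the
# multiply and reduce passes limb by limb and keeps the running window
# in registers). The caller precomputes n0 = -np[0]^-1 mod 2^64:
#
#	tp = 0;
#	for (i = 0; i < num; i++) {
#		m1 = (tp[0] + ap[0]*bp[i]) * n0 mod 2^64;
#		tp = (tp + ap*bp[i] + np*m1) >> 64;	# division is exact
#	}
#	rp = (tp >= np) ? tp - np : tp;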

$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	On the removal of the first multiplication and addition
	//	instructions: the outcome of the first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or it doesn't.
	//	When does it carry? Following the operations, the
	//	condition for carry turns out to be quite simple: $lo0
	//	being non-zero. The carry can therefore be calculated
	//	by adding -1 to $lo0, which is what the next instruction
	//	does.
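	//	Why is the outcome zero in the first place? Because
	//	m1 = $lo0*n0 mod 2^64 and the caller supplies
	//	n0 = -np[0]^-1 mod 2^64, so $lo0 + lo(np[0]*m1) = 0
	//	mod 2^64. The exact sum is therefore either 0 (when
	//	$lo0 == 0) or 2^64 (when $lo0 != 0), i.e. it carries
	//	exactly when $lo0 is non-zero, which is the condition
	//	"subs xzr,$lo0,#1" leaves in the carry flag.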
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// topmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// topmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But since
	// comparison implies subtraction, we simply subtract the
	// modulus, check whether that borrowed, and conditionally copy
	// the original value back.
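	// In C terms (a hedged, illustrative sketch of the tail below,
	// not a literal transcription; "borrow" stands for the state
	// of the carry flag after the sbcs chain):
	//
	//	for (j = 0; j < num; j++)
	//		rp[j] = tp[j] - np[j];		// sbcs chain
	//	borrow = ($ovf - borrow_so_far) < 0;	// sbcs $ovf,...
	//	for (j = 0; j < num; j++) {
	//		rp[j] = borrow ? tp[j] : rp[j];	// csel ...,lo
	//		tp[j] = 0;			// wipe scratch
	//	}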
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
########################################################################
# The following is an ARMv8 adaptation of sqrx8x_mont from the
# x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);
$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)
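	// The triangle above is the off-diagonal half of the square,
	// following the identity
	//
	//	a^2 = sum_i a[i]^2 * 2^(128*i)
	//	    + 2 * sum_{i<j} a[i]*a[j] * 2^(64*(i+j))
	//
	// so each cross product is accumulated only once here; the
	// doubling and the a[i]*a[i] diagonal are folded in later,
	// starting at .Lsqr8x_outer_break.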

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+hi(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	 mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
	adcs	$acc6,$acc6,$t3
	 mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	 mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	 mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
	adcs	$acc7,$acc7,$t3
	 mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	 mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	 mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
	adcs	$acc0,$acc0,$t1
	 mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	 mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	 mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
	adcs	$acc1,$acc1,$t1
	 mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	 mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	 mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	 mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	 umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	 mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
	adcs	$acc4,$acc4,$t3
	 umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewound ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldr	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply the above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
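	// Doubling is done limb by limb with extr: for 64-bit limbs,
	// "extr lo,hi,lo,#63" computes hi<<1 | lo>>63, i.e. the doubled
	// limb at hi's position together with the bit carried in from
	// the limb below. The adcs chain then folds in the a[i]*a[i]
	// diagonal produced by the mul/umulh pairs.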
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	 ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	 ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	 ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	 ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	 ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
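	// Schematically (a hedged multi-precision sketch; t is the
	// 2*num-limb value accumulated above, n the modulus and
	// n0 = -n[0]^-1 mod 2^64):
	//
	//	for (k = 0; k < num/8; k++)
	//		for (i = 0; i < 8; i++) {	// 8 limbs per trip
	//			na0 = t[0]*n0 mod 2^64;
	//			t   = (t + na0*n) >> 64;  // low limb is 0
	//		}
	//
	// with carries beyond the current 8-limb window propagated by
	// .Lsqr8x_tail and the window's top-most carry kept in $topmost.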
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldr	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewound np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldr	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But since
	// comparison implies subtraction, we simply subtract the
	// modulus, check whether that borrowed, and conditionally copy
	// the original value back.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	 mov	$tp,sp
	 add	$ap,sp,$num
	 ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	 ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	 ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	 ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*2]
	 add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	 stp	xzr,xzr,[$ap,#8*0]
	 stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	 stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	 stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	 stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	 stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	 stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	 stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	 stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	 stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	 stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it differs in that it performs the
# reduction 256 bits at a time.
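#
# Schematically, for each b[i] the loops below compute (a hedged
# multi-precision sketch, not the literal register allocation):
#
#	acc = acc + a*b[i];			# multiply pass
#	mi  = acc[0]*n0 mod 2^64;		# per-b[i] factor
#	acc = (acc + mi*n) >> 64;		# one limb retired
#
# where only a[0..3] and n[0..3] are handled in the reduction loop
# itself; the a[4..] and n[4..] tails are processed by the *_tail
# loops, replaying the mi values that were put aside on the stack.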

my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my  $bp_end=$rp;
my  ($carry,$topmost) = ($rp,"x30");

$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// a[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewound $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next a[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewound np?
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But since
	// comparison implies subtraction, we simply subtract the
	// modulus, check whether that borrowed, and conditionally copy
	// the original value back.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	 mov	$tp,sp
	 add	$ap,sp,#8*4
	 ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	 ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	 ldp	$acc0,$acc1,[$ap,#8*0]
	 ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*2]
	 add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	 stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	 stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-3 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	 stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	 stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	 stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	 stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print $code;

close STDOUT;