1.ident	"sparcv8plus.s, Version 1.4"
2.ident	"SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
3
4/*
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * project.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
11 * disclaimed.
12 * ====================================================================
13 */
14
15/*
 * This is my modest contribution to the OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in UltraSPARC ISA replacement for the crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
20 *
21 * Questions-n-answers.
22 *
23 * Q. How to compile?
24 * A. With SC4.x/SC5.x:
25 *
26 *	cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
27 *
28 *    and with gcc:
29 *
30 *	gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
31 *
32 *    or if above fails (it does if you have gas installed):
33 *
34 *	gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
35 *
36 *    Quick-n-dirty way to fuse the module into the library.
37 *    Provided that the library is already configured and built
38 *    (in 0.9.2 case with no-asm option):
39 *
40 *	# cd crypto/bn
41 *	# cp /some/place/bn_asm.sparc.v8plus.S .
42 *	# cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
43 *	# make
44 *	# cd ../..
45 *	# make; make test
46 *
47 *    Quick-n-dirty way to get rid of it:
48 *
49 *	# cd crypto/bn
50 *	# touch bn_asm.c
51 *	# make
52 *	# cd ../..
53 *	# make; make test
54 *
 * Q. V8plus architecture? What kind of beast is that?
 * A. Well, it's rather a programming model than an architecture...
 *    It's actually any v9-compliant CPU, i.e. *any* UltraSPARC, run
 *    under special conditions, namely when the kernel doesn't preserve
 *    the upper 32 bits of the otherwise 64-bit registers during a
 *    context switch.
60 *
61 * Q. Why just UltraSPARC? What about SuperSPARC?
 * A. The original release did target UltraSPARC only. Now a SuperSPARC
 *    version is provided along with it. Both versions share the
 *    bn_*comba[48] implementations (see the comment later in the code
 *    for an explanation). But what's so special about this UltraSPARC
 *    implementation? Why didn't I let the compiler do the job? The
 *    trouble is that most of the available compilers (well, SC5.0 is
 *    the only exception) don't attempt to take advantage of
 *    UltraSPARC's 64-bitness under 32-bit kernels even though it's
 *    perfectly possible (see the next question).
71 *
72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
73 *    doesn't work?
 * A. You can't address *all* registers as 64-bit wide:-( The catch is
 *    that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
 *    preserved if you're in a leaf function, i.e. one that never calls
 *    any other function. All functions in this module are leaf and
 *    10 registers is a handful. As a matter of fact the non-"comba"
 *    routines don't even require that much, and I could afford not to
 *    allocate an own stack frame for 'em:-)
81 *
82 * Q. What about 64-bit kernels?
 * A. What about 'em? Just kidding:-) A pure 64-bit version is
 *    currently under evaluation and development...
85 *
86 * Q. What about shared libraries?
 * A. What about 'em? Kidding again:-) The code does *not* contain any
 *    position dependencies and it's safe to include it in a shared
 *    library as is.
90 *
91 * Q. How much faster does it go?
 * A. Do you have a good benchmark? In any case, below is what I
 *    experience with the crypto/bn/expspeed.c test program:
94 *
95 *	v8plus module on U10/300MHz against bn_asm.c compiled with:
96 *
97 *	cc-5.0 -xarch=v8plus -xO5 -xdepend	+7-12%
98 *	cc-4.2 -xarch=v8plus -xO5 -xdepend	+25-35%
99 *	egcs-1.1.2 -mcpu=ultrasparc -O3		+35-45%
100 *
101 *	v8 module on SS10/60MHz against bn_asm.c compiled with:
102 *
103 *	cc-5.0 -xarch=v8 -xO5 -xdepend		+7-10%
104 *	cc-4.2 -xarch=v8 -xO5 -xdepend		+10%
105 *	egcs-1.1.2 -mv8 -O3			+35-45%
106 *
 *    As you can see, it's damn hard to beat the new Sun C compiler,
 *    so it's first and foremost GNU C users who will appreciate this
 *    assembler implementation:-)
110 */
111
112/*
113 * Revision history.
114 *
115 * 1.0	- initial release;
116 * 1.1	- new loop unrolling model(*);
117 *	- some more fine tuning;
118 * 1.2	- made gas friendly;
119 *	- updates to documentation concerning v9;
120 *	- new performance comparison matrix;
121 * 1.3	- fixed problem with /usr/ccs/lib/cpp;
122 * 1.4	- native V9 bn_*_comba[48] implementation (15% more efficient)
123 *	  resulting in slight overall performance kick;
124 *	- some retunes;
125 *	- support for GNU as added;
126 *
127 * (*)	Originally unrolled loop looked like this:
128 *	    for (;;) {
129 *		op(p+0); if (--n==0) break;
130 *		op(p+1); if (--n==0) break;
131 *		op(p+2); if (--n==0) break;
132 *		op(p+3); if (--n==0) break;
133 *		p+=4;
134 *	    }
 *	I unroll according to the following:
 *	    while (n&~3) {
 *		op(p+0); op(p+1); op(p+2); op(p+3);
 *		p+=4; n-=4;
 *	    }
 *	    if (n) {
 *		op(p+0); if (--n==0) return;
 *		op(p+1); if (--n==0) return;
 *		op(p+2); return;
 *	    }
145 */
146
147#if defined(__SUNPRO_C) && defined(__sparcv9)
  /* They've said -xarch=v9 on the command line */
149  .register	%g2,#scratch
150  .register	%g3,#scratch
151# define	FRAME_SIZE	-192
152#elif defined(__GNUC__) && defined(__arch64__)
  /* They've said -m64 on the command line */
154  .register	%g2,#scratch
155  .register	%g3,#scratch
156# define	FRAME_SIZE	-192
157#else
158# define	FRAME_SIZE	-96
159#endif
160/*
161 * GNU assembler can't stand stuw:-(
162 */
163#define stuw st
164
165.section	".text",#alloc,#execinstr
166.file		"bn_asm.sparc.v8plus.S"
167
168.align	32
169
170.global bn_mul_add_words
171/*
172 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
173 * BN_ULONG *rp,*ap;
174 * int num;
175 * BN_ULONG w;
176 */
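/*
 * Roughly, in C, the routine below computes the following (an
 * illustrative sketch assuming 32-bit BN_ULONG, not the code it was
 * derived from):
 *
 *	BN_ULONG c = 0;
 *	while (num--) {
 *		unsigned long long t = (unsigned long long)(*ap++)*w + *rp + c;
 *		*rp++ = (BN_ULONG)t;
 *		c     = (BN_ULONG)(t >> 32);
 *	}
 *	return c;
 */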
177bn_mul_add_words:
178	sra	%o2,%g0,%o2	! signx %o2
179	brgz,a	%o2,.L_bn_mul_add_words_proceed
180	lduw	[%o1],%g2
181	retl
182	clr	%o0
183	nop
184	nop
185	nop
186
187.L_bn_mul_add_words_proceed:
188	srl	%o3,%g0,%o3	! clruw	%o3
189	andcc	%o2,-4,%g0
190	bz,pn	%icc,.L_bn_mul_add_words_tail
191	clr	%o5
192
193.L_bn_mul_add_words_loop:	! wow! 32 aligned!
194	lduw	[%o0],%g1
195	lduw	[%o1+4],%g3
196	mulx	%o3,%g2,%g2
197	add	%g1,%o5,%o4
198	nop
199	add	%o4,%g2,%o4
200	stuw	%o4,[%o0]
201	srlx	%o4,32,%o5
202
203	lduw	[%o0+4],%g1
204	lduw	[%o1+8],%g2
205	mulx	%o3,%g3,%g3
206	add	%g1,%o5,%o4
207	dec	4,%o2
208	add	%o4,%g3,%o4
209	stuw	%o4,[%o0+4]
210	srlx	%o4,32,%o5
211
212	lduw	[%o0+8],%g1
213	lduw	[%o1+12],%g3
214	mulx	%o3,%g2,%g2
215	add	%g1,%o5,%o4
216	inc	16,%o1
217	add	%o4,%g2,%o4
218	stuw	%o4,[%o0+8]
219	srlx	%o4,32,%o5
220
221	lduw	[%o0+12],%g1
222	mulx	%o3,%g3,%g3
223	add	%g1,%o5,%o4
224	inc	16,%o0
225	add	%o4,%g3,%o4
226	andcc	%o2,-4,%g0
227	stuw	%o4,[%o0-4]
228	srlx	%o4,32,%o5
229	bnz,a,pt	%icc,.L_bn_mul_add_words_loop
230	lduw	[%o1],%g2
231
232	brnz,a,pn	%o2,.L_bn_mul_add_words_tail
233	lduw	[%o1],%g2
234.L_bn_mul_add_words_return:
235	retl
236	mov	%o5,%o0
237
238.L_bn_mul_add_words_tail:
239	lduw	[%o0],%g1
240	mulx	%o3,%g2,%g2
241	add	%g1,%o5,%o4
242	dec	%o2
243	add	%o4,%g2,%o4
244	srlx	%o4,32,%o5
245	brz,pt	%o2,.L_bn_mul_add_words_return
246	stuw	%o4,[%o0]
247
248	lduw	[%o1+4],%g2
249	lduw	[%o0+4],%g1
250	mulx	%o3,%g2,%g2
251	add	%g1,%o5,%o4
252	dec	%o2
253	add	%o4,%g2,%o4
254	srlx	%o4,32,%o5
255	brz,pt	%o2,.L_bn_mul_add_words_return
256	stuw	%o4,[%o0+4]
257
258	lduw	[%o1+8],%g2
259	lduw	[%o0+8],%g1
260	mulx	%o3,%g2,%g2
261	add	%g1,%o5,%o4
262	add	%o4,%g2,%o4
263	stuw	%o4,[%o0+8]
264	retl
265	srlx	%o4,32,%o0
266
267.type	bn_mul_add_words,#function
268.size	bn_mul_add_words,(.-bn_mul_add_words)
269
270.align	32
271
272.global bn_mul_words
273/*
274 * BN_ULONG bn_mul_words(rp,ap,num,w)
275 * BN_ULONG *rp,*ap;
276 * int num;
277 * BN_ULONG w;
278 */
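/*
 * Same idea as bn_mul_add_words above, except that the product is
 * stored rather than added in (illustrative C sketch):
 *
 *	BN_ULONG c = 0;
 *	while (num--) {
 *		unsigned long long t = (unsigned long long)(*ap++)*w + c;
 *		*rp++ = (BN_ULONG)t;
 *		c     = (BN_ULONG)(t >> 32);
 *	}
 *	return c;
 */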
279bn_mul_words:
280	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_mul_words_proceed
282	lduw	[%o1],%g2
283	retl
284	clr	%o0
285	nop
286	nop
287	nop
288
.L_bn_mul_words_proceed:
290	srl	%o3,%g0,%o3	! clruw	%o3
291	andcc	%o2,-4,%g0
292	bz,pn	%icc,.L_bn_mul_words_tail
293	clr	%o5
294
295.L_bn_mul_words_loop:		! wow! 32 aligned!
296	lduw	[%o1+4],%g3
297	mulx	%o3,%g2,%g2
298	add	%g2,%o5,%o4
299	nop
300	stuw	%o4,[%o0]
301	srlx	%o4,32,%o5
302
303	lduw	[%o1+8],%g2
304	mulx	%o3,%g3,%g3
305	add	%g3,%o5,%o4
306	dec	4,%o2
307	stuw	%o4,[%o0+4]
308	srlx	%o4,32,%o5
309
310	lduw	[%o1+12],%g3
311	mulx	%o3,%g2,%g2
312	add	%g2,%o5,%o4
313	inc	16,%o1
314	stuw	%o4,[%o0+8]
315	srlx	%o4,32,%o5
316
317	mulx	%o3,%g3,%g3
318	add	%g3,%o5,%o4
319	inc	16,%o0
320	stuw	%o4,[%o0-4]
321	srlx	%o4,32,%o5
322	andcc	%o2,-4,%g0
323	bnz,a,pt	%icc,.L_bn_mul_words_loop
324	lduw	[%o1],%g2
325	nop
326	nop
327
328	brnz,a,pn	%o2,.L_bn_mul_words_tail
329	lduw	[%o1],%g2
330.L_bn_mul_words_return:
331	retl
332	mov	%o5,%o0
333
334.L_bn_mul_words_tail:
335	mulx	%o3,%g2,%g2
336	add	%g2,%o5,%o4
337	dec	%o2
338	srlx	%o4,32,%o5
339	brz,pt	%o2,.L_bn_mul_words_return
340	stuw	%o4,[%o0]
341
342	lduw	[%o1+4],%g2
343	mulx	%o3,%g2,%g2
344	add	%g2,%o5,%o4
345	dec	%o2
346	srlx	%o4,32,%o5
347	brz,pt	%o2,.L_bn_mul_words_return
348	stuw	%o4,[%o0+4]
349
350	lduw	[%o1+8],%g2
351	mulx	%o3,%g2,%g2
352	add	%g2,%o5,%o4
353	stuw	%o4,[%o0+8]
354	retl
355	srlx	%o4,32,%o0
356
357.type	bn_mul_words,#function
358.size	bn_mul_words,(.-bn_mul_words)
359
360.align  32
361.global	bn_sqr_words
362/*
363 * void bn_sqr_words(r,a,n)
364 * BN_ULONG *r,*a;
365 * int n;
366 */
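/*
 * Each input word is squared and the 64-bit result is written out as
 * two words, low word first (illustrative C sketch):
 *
 *	while (n--) {
 *		unsigned long long t = (unsigned long long)(*a)*(*a);
 *		a++;
 *		*r++ = (BN_ULONG)t;
 *		*r++ = (BN_ULONG)(t >> 32);
 *	}
 */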
367bn_sqr_words:
368	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_sqr_words_proceed
370	lduw	[%o1],%g2
371	retl
372	clr	%o0
373	nop
374	nop
375	nop
376
.L_bn_sqr_words_proceed:
378	andcc	%o2,-4,%g0
379	nop
380	bz,pn	%icc,.L_bn_sqr_words_tail
381	nop
382
383.L_bn_sqr_words_loop:		! wow! 32 aligned!
384	lduw	[%o1+4],%g3
385	mulx	%g2,%g2,%o4
386	stuw	%o4,[%o0]
387	srlx	%o4,32,%o5
388	stuw	%o5,[%o0+4]
389	nop
390
391	lduw	[%o1+8],%g2
392	mulx	%g3,%g3,%o4
393	dec	4,%o2
394	stuw	%o4,[%o0+8]
395	srlx	%o4,32,%o5
396	stuw	%o5,[%o0+12]
397
398	lduw	[%o1+12],%g3
399	mulx	%g2,%g2,%o4
400	srlx	%o4,32,%o5
401	stuw	%o4,[%o0+16]
402	inc	16,%o1
403	stuw	%o5,[%o0+20]
404
405	mulx	%g3,%g3,%o4
406	inc	32,%o0
407	stuw	%o4,[%o0-8]
408	srlx	%o4,32,%o5
409	andcc	%o2,-4,%g2
410	stuw	%o5,[%o0-4]
411	bnz,a,pt	%icc,.L_bn_sqr_words_loop
412	lduw	[%o1],%g2
413	nop
414
415	brnz,a,pn	%o2,.L_bn_sqr_words_tail
416	lduw	[%o1],%g2
417.L_bn_sqr_words_return:
418	retl
419	clr	%o0
420
421.L_bn_sqr_words_tail:
422	mulx	%g2,%g2,%o4
423	dec	%o2
424	stuw	%o4,[%o0]
425	srlx	%o4,32,%o5
426	brz,pt	%o2,.L_bn_sqr_words_return
427	stuw	%o5,[%o0+4]
428
429	lduw	[%o1+4],%g2
430	mulx	%g2,%g2,%o4
431	dec	%o2
432	stuw	%o4,[%o0+8]
433	srlx	%o4,32,%o5
434	brz,pt	%o2,.L_bn_sqr_words_return
435	stuw	%o5,[%o0+12]
436
437	lduw	[%o1+8],%g2
438	mulx	%g2,%g2,%o4
439	srlx	%o4,32,%o5
440	stuw	%o4,[%o0+16]
441	stuw	%o5,[%o0+20]
442	retl
443	clr	%o0
444
445.type	bn_sqr_words,#function
446.size	bn_sqr_words,(.-bn_sqr_words)
447
448.align	32
449.global bn_div_words
450/*
451 * BN_ULONG bn_div_words(h,l,d)
452 * BN_ULONG h,l,d;
453 */
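/*
 * Equivalent C (illustrative sketch): form the 64-bit dividend, divide
 * with a single udivx and truncate the quotient to 32 bits:
 *
 *	return (BN_ULONG)((((unsigned long long)h<<32)|l)/d);
 */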
454bn_div_words:
455	sllx	%o0,32,%o0
456	or	%o0,%o1,%o0
457	udivx	%o0,%o2,%o0
458	retl
459	srl	%o0,%g0,%o0	! clruw	%o0
460
461.type	bn_div_words,#function
462.size	bn_div_words,(.-bn_div_words)
463
464.align	32
465
466.global bn_add_words
467/*
468 * BN_ULONG bn_add_words(rp,ap,bp,n)
469 * BN_ULONG *rp,*ap,*bp;
470 * int n;
471 */
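/*
 * Word-wise addition with carry propagation, returning the final
 * carry (illustrative C sketch):
 *
 *	BN_ULONG c = 0;
 *	while (n--) {
 *		unsigned long long t = (unsigned long long)(*ap++) + *bp++ + c;
 *		*rp++ = (BN_ULONG)t;
 *		c     = (BN_ULONG)(t >> 32);
 *	}
 *	return c;
 */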
472bn_add_words:
473	sra	%o3,%g0,%o3	! signx %o3
474	brgz,a	%o3,.L_bn_add_words_proceed
475	lduw	[%o1],%o4
476	retl
477	clr	%o0
478
479.L_bn_add_words_proceed:
480	andcc	%o3,-4,%g0
481	bz,pn	%icc,.L_bn_add_words_tail
482	addcc	%g0,0,%g0	! clear carry flag
483
484.L_bn_add_words_loop:		! wow! 32 aligned!
485	dec	4,%o3
486	lduw	[%o2],%o5
487	lduw	[%o1+4],%g1
488	lduw	[%o2+4],%g2
489	lduw	[%o1+8],%g3
490	lduw	[%o2+8],%g4
491	addccc	%o5,%o4,%o5
492	stuw	%o5,[%o0]
493
494	lduw	[%o1+12],%o4
495	lduw	[%o2+12],%o5
496	inc	16,%o1
497	addccc	%g1,%g2,%g1
498	stuw	%g1,[%o0+4]
499
500	inc	16,%o2
501	addccc	%g3,%g4,%g3
502	stuw	%g3,[%o0+8]
503
504	inc	16,%o0
505	addccc	%o5,%o4,%o5
506	stuw	%o5,[%o0-4]
507	and	%o3,-4,%g1
508	brnz,a,pt	%g1,.L_bn_add_words_loop
509	lduw	[%o1],%o4
510
511	brnz,a,pn	%o3,.L_bn_add_words_tail
512	lduw	[%o1],%o4
513.L_bn_add_words_return:
514	clr	%o0
515	retl
516	movcs	%icc,1,%o0
517	nop
518
519.L_bn_add_words_tail:
520	lduw	[%o2],%o5
521	dec	%o3
522	addccc	%o5,%o4,%o5
523	brz,pt	%o3,.L_bn_add_words_return
524	stuw	%o5,[%o0]
525
526	lduw	[%o1+4],%o4
527	lduw	[%o2+4],%o5
528	dec	%o3
529	addccc	%o5,%o4,%o5
530	brz,pt	%o3,.L_bn_add_words_return
531	stuw	%o5,[%o0+4]
532
533	lduw	[%o1+8],%o4
534	lduw	[%o2+8],%o5
535	addccc	%o5,%o4,%o5
536	stuw	%o5,[%o0+8]
537	clr	%o0
538	retl
539	movcs	%icc,1,%o0
540
541.type	bn_add_words,#function
542.size	bn_add_words,(.-bn_add_words)
543
544.global bn_sub_words
545/*
546 * BN_ULONG bn_sub_words(rp,ap,bp,n)
547 * BN_ULONG *rp,*ap,*bp;
548 * int n;
549 */
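/*
 * The mirror image of bn_add_words: word-wise subtraction with borrow
 * propagation, returning the final borrow (illustrative C sketch):
 *
 *	BN_ULONG c = 0;
 *	while (n--) {
 *		BN_ULONG t1 = *ap++, t2 = *bp++;
 *		*rp++ = t1 - t2 - c;
 *		if (t1 != t2) c = (t1 < t2);
 *	}
 *	return c;
 */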
550bn_sub_words:
551	sra	%o3,%g0,%o3	! signx %o3
552	brgz,a	%o3,.L_bn_sub_words_proceed
553	lduw	[%o1],%o4
554	retl
555	clr	%o0
556
557.L_bn_sub_words_proceed:
558	andcc	%o3,-4,%g0
559	bz,pn	%icc,.L_bn_sub_words_tail
560	addcc	%g0,0,%g0	! clear carry flag
561
562.L_bn_sub_words_loop:		! wow! 32 aligned!
563	dec	4,%o3
564	lduw	[%o2],%o5
565	lduw	[%o1+4],%g1
566	lduw	[%o2+4],%g2
567	lduw	[%o1+8],%g3
568	lduw	[%o2+8],%g4
569	subccc	%o4,%o5,%o5
570	stuw	%o5,[%o0]
571
572	lduw	[%o1+12],%o4
573	lduw	[%o2+12],%o5
574	inc	16,%o1
575	subccc	%g1,%g2,%g2
576	stuw	%g2,[%o0+4]
577
578	inc	16,%o2
579	subccc	%g3,%g4,%g4
580	stuw	%g4,[%o0+8]
581
582	inc	16,%o0
583	subccc	%o4,%o5,%o5
584	stuw	%o5,[%o0-4]
585	and	%o3,-4,%g1
586	brnz,a,pt	%g1,.L_bn_sub_words_loop
587	lduw	[%o1],%o4
588
589	brnz,a,pn	%o3,.L_bn_sub_words_tail
590	lduw	[%o1],%o4
591.L_bn_sub_words_return:
592	clr	%o0
593	retl
594	movcs	%icc,1,%o0
595	nop
596
597.L_bn_sub_words_tail:		! wow! 32 aligned!
598	lduw	[%o2],%o5
599	dec	%o3
600	subccc	%o4,%o5,%o5
601	brz,pt	%o3,.L_bn_sub_words_return
602	stuw	%o5,[%o0]
603
604	lduw	[%o1+4],%o4
605	lduw	[%o2+4],%o5
606	dec	%o3
607	subccc	%o4,%o5,%o5
608	brz,pt	%o3,.L_bn_sub_words_return
609	stuw	%o5,[%o0+4]
610
611	lduw	[%o1+8],%o4
612	lduw	[%o2+8],%o5
613	subccc	%o4,%o5,%o5
614	stuw	%o5,[%o0+8]
615	clr	%o0
616	retl
617	movcs	%icc,1,%o0
618
619.type	bn_sub_words,#function
620.size	bn_sub_words,(.-bn_sub_words)
621
/*
 * The code below depends on the fact that the upper parts of %l0-%l7
 * and %i0-%i7 are zeroed by the kernel after a context switch. In
 * previous versions this comment stated that "the trouble is that
 * it's not feasible to implement the mumbo-jumbo in less V9
 * instructions:-(" which apparently isn't true thanks to the
 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 * results not from the shorter code, but from the elimination of
 * multicycle non-pairable 'rd %y,%rd' instructions.
631 *
632 *							Andy.
633 */
634
635/*
636 * Here is register usage map for *all* routines below.
637 */
638#define t_1	%o0
639#define	t_2	%o1
640#define c_12	%o2
641#define c_3	%o3
642
643#define ap(I)	[%i1+4*I]
644#define bp(I)	[%i2+4*I]
645#define rp(I)	[%i0+4*I]
646
647#define	a_0	%l0
648#define	a_1	%l1
649#define	a_2	%l2
650#define	a_3	%l3
651#define	a_4	%l4
652#define	a_5	%l5
653#define	a_6	%l6
654#define	a_7	%l7
655
656#define	b_0	%i3
657#define	b_1	%i4
658#define	b_2	%i5
659#define	b_3	%o4
660#define	b_4	%o5
661#define	b_5	%o7
662#define	b_6	%g1
663#define	b_7	%g4
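/*
 * For the record, one mul_add_c carry step in the comba routines below
 * amounts to roughly the following C (an illustrative sketch of how I
 * read the code, not the actual bn_lcl.h macros): c_12 is the running
 * 64-bit column accumulator, c_3 collects overflows in units of 2^32
 * (which is what t_2 holds), and each mulx/addcc/bcs,a group
 * implements one step:
 *
 *	t_1   = (unsigned long long)a[i]*b[j];
 *	c_12 += t_1;
 *	if (c_12 < t_1)
 *		c_3 += (unsigned long long)1<<32;
 *
 *    and at the end of each column:
 *
 *	r[k]  = (unsigned int)c_12;
 *	c_12  = (c_12>>32)|c_3;
 *	c_3   = 0;
 */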
664
665.align	32
666.global bn_mul_comba8
667/*
668 * void bn_mul_comba8(r,a,b)
669 * BN_ULONG *r,*a,*b;
670 */
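/*
 * Computes the full 16-word product r[0..15] = a[0..7] * b[0..7].
 * A plain schoolbook reference in C (illustrative sketch only; the
 * code below instead walks the a[i]*b[j] partial products column by
 * column, comba style):
 *
 *	BN_ULONG t[16] = {0};
 *	int i, j;
 *	for (i = 0; i < 8; i++) {
 *		BN_ULONG c = 0;
 *		for (j = 0; j < 8; j++) {
 *			unsigned long long u;
 *			u      = (unsigned long long)a[i]*b[j] + t[i+j] + c;
 *			t[i+j] = (BN_ULONG)u;
 *			c      = (BN_ULONG)(u >> 32);
 *		}
 *		t[i+8] = c;
 *	}
 *	memcpy(r, t, sizeof(t));
 */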
671bn_mul_comba8:
672	save	%sp,FRAME_SIZE,%sp
673	mov	1,t_2
674	lduw	ap(0),a_0
675	sllx	t_2,32,t_2
676	lduw	bp(0),b_0	!=
677	lduw	bp(1),b_1
678	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
679	srlx	t_1,32,c_12
680	stuw	t_1,rp(0)	!=!r[0]=c1;
681
682	lduw	ap(1),a_1
683	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
684	addcc	c_12,t_1,c_12
685	clr	c_3		!=
686	bcs,a	%xcc,.+8
687	add	c_3,t_2,c_3
688	lduw	ap(2),a_2
689	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
690	addcc	c_12,t_1,t_1
691	bcs,a	%xcc,.+8
692	add	c_3,t_2,c_3
693	srlx	t_1,32,c_12	!=
694	stuw	t_1,rp(1)	!r[1]=c2;
695	or	c_12,c_3,c_12
696
697	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
698	addcc	c_12,t_1,c_12	!=
699	clr	c_3
700	bcs,a	%xcc,.+8
701	add	c_3,t_2,c_3
702	lduw	bp(2),b_2	!=
703	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
704	addcc	c_12,t_1,c_12
705	bcs,a	%xcc,.+8
706	add	c_3,t_2,c_3	!=
707	lduw	bp(3),b_3
708	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
709	addcc	c_12,t_1,t_1
710	bcs,a	%xcc,.+8	!=
711	add	c_3,t_2,c_3
712	srlx	t_1,32,c_12
713	stuw	t_1,rp(2)	!r[2]=c3;
714	or	c_12,c_3,c_12	!=
715
716	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
717	addcc	c_12,t_1,c_12
718	clr	c_3
719	bcs,a	%xcc,.+8	!=
720	add	c_3,t_2,c_3
721	mulx	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
722	addcc	c_12,t_1,c_12
723	bcs,a	%xcc,.+8	!=
724	add	c_3,t_2,c_3
725	lduw	ap(3),a_3
726	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
727	addcc	c_12,t_1,c_12	!=
728	bcs,a	%xcc,.+8
729	add	c_3,t_2,c_3
730	lduw	ap(4),a_4
731	mulx	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);!=
732	addcc	c_12,t_1,t_1
733	bcs,a	%xcc,.+8
734	add	c_3,t_2,c_3
735	srlx	t_1,32,c_12	!=
736	stuw	t_1,rp(3)	!r[3]=c1;
737	or	c_12,c_3,c_12
738
739	mulx	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
740	addcc	c_12,t_1,c_12	!=
741	clr	c_3
742	bcs,a	%xcc,.+8
743	add	c_3,t_2,c_3
744	mulx	a_3,b_1,t_1	!=!mul_add_c(a[3],b[1],c2,c3,c1);
745	addcc	c_12,t_1,c_12
746	bcs,a	%xcc,.+8
747	add	c_3,t_2,c_3
748	mulx	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
749	addcc	c_12,t_1,c_12
750	bcs,a	%xcc,.+8
751	add	c_3,t_2,c_3
752	lduw	bp(4),b_4	!=
753	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
754	addcc	c_12,t_1,c_12
755	bcs,a	%xcc,.+8
756	add	c_3,t_2,c_3	!=
757	lduw	bp(5),b_5
758	mulx	a_0,b_4,t_1	!mul_add_c(a[0],b[4],c2,c3,c1);
759	addcc	c_12,t_1,t_1
760	bcs,a	%xcc,.+8	!=
761	add	c_3,t_2,c_3
762	srlx	t_1,32,c_12
763	stuw	t_1,rp(4)	!r[4]=c2;
764	or	c_12,c_3,c_12	!=
765
766	mulx	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
767	addcc	c_12,t_1,c_12
768	clr	c_3
769	bcs,a	%xcc,.+8	!=
770	add	c_3,t_2,c_3
771	mulx	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
772	addcc	c_12,t_1,c_12
773	bcs,a	%xcc,.+8	!=
774	add	c_3,t_2,c_3
775	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
776	addcc	c_12,t_1,c_12
777	bcs,a	%xcc,.+8	!=
778	add	c_3,t_2,c_3
779	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
780	addcc	c_12,t_1,c_12
781	bcs,a	%xcc,.+8	!=
782	add	c_3,t_2,c_3
783	lduw	ap(5),a_5
784	mulx	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
785	addcc	c_12,t_1,c_12	!=
786	bcs,a	%xcc,.+8
787	add	c_3,t_2,c_3
788	lduw	ap(6),a_6
789	mulx	a_5,b_0,t_1	!=!mul_add_c(a[5],b[0],c3,c1,c2);
790	addcc	c_12,t_1,t_1
791	bcs,a	%xcc,.+8
792	add	c_3,t_2,c_3
793	srlx	t_1,32,c_12	!=
794	stuw	t_1,rp(5)	!r[5]=c3;
795	or	c_12,c_3,c_12
796
797	mulx	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
798	addcc	c_12,t_1,c_12	!=
799	clr	c_3
800	bcs,a	%xcc,.+8
801	add	c_3,t_2,c_3
802	mulx	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
803	addcc	c_12,t_1,c_12
804	bcs,a	%xcc,.+8
805	add	c_3,t_2,c_3
806	mulx	a_4,b_2,t_1	!=!mul_add_c(a[4],b[2],c1,c2,c3);
807	addcc	c_12,t_1,c_12
808	bcs,a	%xcc,.+8
809	add	c_3,t_2,c_3
810	mulx	a_3,b_3,t_1	!=!mul_add_c(a[3],b[3],c1,c2,c3);
811	addcc	c_12,t_1,c_12
812	bcs,a	%xcc,.+8
813	add	c_3,t_2,c_3
814	mulx	a_2,b_4,t_1	!=!mul_add_c(a[2],b[4],c1,c2,c3);
815	addcc	c_12,t_1,c_12
816	bcs,a	%xcc,.+8
817	add	c_3,t_2,c_3
818	lduw	bp(6),b_6	!=
819	mulx	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
820	addcc	c_12,t_1,c_12
821	bcs,a	%xcc,.+8
822	add	c_3,t_2,c_3	!=
823	lduw	bp(7),b_7
824	mulx	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
825	addcc	c_12,t_1,t_1
826	bcs,a	%xcc,.+8	!=
827	add	c_3,t_2,c_3
828	srlx	t_1,32,c_12
829	stuw	t_1,rp(6)	!r[6]=c1;
830	or	c_12,c_3,c_12	!=
831
832	mulx	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
833	addcc	c_12,t_1,c_12
834	clr	c_3
835	bcs,a	%xcc,.+8	!=
836	add	c_3,t_2,c_3
837	mulx	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
838	addcc	c_12,t_1,c_12
839	bcs,a	%xcc,.+8	!=
840	add	c_3,t_2,c_3
841	mulx	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
842	addcc	c_12,t_1,c_12
843	bcs,a	%xcc,.+8	!=
844	add	c_3,t_2,c_3
845	mulx	a_3,b_4,t_1	!mul_add_c(a[3],b[4],c2,c3,c1);
846	addcc	c_12,t_1,c_12
847	bcs,a	%xcc,.+8	!=
848	add	c_3,t_2,c_3
849	mulx	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
850	addcc	c_12,t_1,c_12
851	bcs,a	%xcc,.+8	!=
852	add	c_3,t_2,c_3
853	mulx	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
854	addcc	c_12,t_1,c_12
855	bcs,a	%xcc,.+8	!=
856	add	c_3,t_2,c_3
857	lduw	ap(7),a_7
858	mulx	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
859	addcc	c_12,t_1,c_12
860	bcs,a	%xcc,.+8
861	add	c_3,t_2,c_3
862	mulx	a_7,b_0,t_1	!=!mul_add_c(a[7],b[0],c2,c3,c1);
863	addcc	c_12,t_1,t_1
864	bcs,a	%xcc,.+8
865	add	c_3,t_2,c_3
866	srlx	t_1,32,c_12	!=
867	stuw	t_1,rp(7)	!r[7]=c2;
868	or	c_12,c_3,c_12
869
870	mulx	a_7,b_1,t_1	!=!mul_add_c(a[7],b[1],c3,c1,c2);
871	addcc	c_12,t_1,c_12
872	clr	c_3
873	bcs,a	%xcc,.+8
874	add	c_3,t_2,c_3	!=
875	mulx	a_6,b_2,t_1	!mul_add_c(a[6],b[2],c3,c1,c2);
876	addcc	c_12,t_1,c_12
877	bcs,a	%xcc,.+8
878	add	c_3,t_2,c_3	!=
879	mulx	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
880	addcc	c_12,t_1,c_12
881	bcs,a	%xcc,.+8
882	add	c_3,t_2,c_3	!=
883	mulx	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
884	addcc	c_12,t_1,c_12
885	bcs,a	%xcc,.+8
886	add	c_3,t_2,c_3	!=
887	mulx	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
888	addcc	c_12,t_1,c_12
889	bcs,a	%xcc,.+8
890	add	c_3,t_2,c_3	!=
891	mulx	a_2,b_6,t_1	!mul_add_c(a[2],b[6],c3,c1,c2);
892	addcc	c_12,t_1,c_12
893	bcs,a	%xcc,.+8
894	add	c_3,t_2,c_3	!=
895	mulx	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
896	addcc	c_12,t_1,t_1
897	bcs,a	%xcc,.+8
898	add	c_3,t_2,c_3	!=
899	srlx	t_1,32,c_12
900	stuw	t_1,rp(8)	!r[8]=c3;
901	or	c_12,c_3,c_12
902
903	mulx	a_2,b_7,t_1	!=!mul_add_c(a[2],b[7],c1,c2,c3);
904	addcc	c_12,t_1,c_12
905	clr	c_3
906	bcs,a	%xcc,.+8
907	add	c_3,t_2,c_3	!=
908	mulx	a_3,b_6,t_1	!mul_add_c(a[3],b[6],c1,c2,c3);
909	addcc	c_12,t_1,c_12
910	bcs,a	%xcc,.+8	!=
911	add	c_3,t_2,c_3
912	mulx	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
913	addcc	c_12,t_1,c_12
914	bcs,a	%xcc,.+8	!=
915	add	c_3,t_2,c_3
916	mulx	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
917	addcc	c_12,t_1,c_12
918	bcs,a	%xcc,.+8	!=
919	add	c_3,t_2,c_3
920	mulx	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
921	addcc	c_12,t_1,c_12
922	bcs,a	%xcc,.+8	!=
923	add	c_3,t_2,c_3
924	mulx	a_7,b_2,t_1	!mul_add_c(a[7],b[2],c1,c2,c3);
925	addcc	c_12,t_1,t_1
926	bcs,a	%xcc,.+8	!=
927	add	c_3,t_2,c_3
928	srlx	t_1,32,c_12
929	stuw	t_1,rp(9)	!r[9]=c1;
930	or	c_12,c_3,c_12	!=
931
932	mulx	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
933	addcc	c_12,t_1,c_12
934	clr	c_3
935	bcs,a	%xcc,.+8	!=
936	add	c_3,t_2,c_3
937	mulx	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
938	addcc	c_12,t_1,c_12
939	bcs,a	%xcc,.+8	!=
940	add	c_3,t_2,c_3
941	mulx	a_5,b_5,t_1	!mul_add_c(a[5],b[5],c2,c3,c1);
942	addcc	c_12,t_1,c_12
943	bcs,a	%xcc,.+8	!=
944	add	c_3,t_2,c_3
945	mulx	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
946	addcc	c_12,t_1,c_12
947	bcs,a	%xcc,.+8	!=
948	add	c_3,t_2,c_3
949	mulx	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
950	addcc	c_12,t_1,t_1
951	bcs,a	%xcc,.+8	!=
952	add	c_3,t_2,c_3
953	srlx	t_1,32,c_12
954	stuw	t_1,rp(10)	!r[10]=c2;
955	or	c_12,c_3,c_12	!=
956
957	mulx	a_4,b_7,t_1	!mul_add_c(a[4],b[7],c3,c1,c2);
958	addcc	c_12,t_1,c_12
959	clr	c_3
960	bcs,a	%xcc,.+8	!=
961	add	c_3,t_2,c_3
962	mulx	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
963	addcc	c_12,t_1,c_12
964	bcs,a	%xcc,.+8	!=
965	add	c_3,t_2,c_3
966	mulx	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
967	addcc	c_12,t_1,c_12
968	bcs,a	%xcc,.+8	!=
969	add	c_3,t_2,c_3
970	mulx	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
971	addcc	c_12,t_1,t_1
972	bcs,a	%xcc,.+8	!=
973	add	c_3,t_2,c_3
974	srlx	t_1,32,c_12
975	stuw	t_1,rp(11)	!r[11]=c3;
976	or	c_12,c_3,c_12	!=
977
978	mulx	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
979	addcc	c_12,t_1,c_12
980	clr	c_3
981	bcs,a	%xcc,.+8	!=
982	add	c_3,t_2,c_3
983	mulx	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
984	addcc	c_12,t_1,c_12
985	bcs,a	%xcc,.+8	!=
986	add	c_3,t_2,c_3
987	mulx	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
988	addcc	c_12,t_1,t_1
989	bcs,a	%xcc,.+8	!=
990	add	c_3,t_2,c_3
991	srlx	t_1,32,c_12
992	stuw	t_1,rp(12)	!r[12]=c1;
993	or	c_12,c_3,c_12	!=
994
995	mulx	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
996	addcc	c_12,t_1,c_12
997	clr	c_3
998	bcs,a	%xcc,.+8	!=
999	add	c_3,t_2,c_3
1000	mulx	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
1001	addcc	c_12,t_1,t_1
1002	bcs,a	%xcc,.+8	!=
1003	add	c_3,t_2,c_3
1004	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
1006	or	c_12,c_3,c_12	!=
1007
1008	mulx	a_7,b_7,t_1	!mul_add_c(a[7],b[7],c3,c1,c2);
1009	addcc	c_12,t_1,t_1
1010	srlx	t_1,32,c_12	!=
1011	stuw	t_1,rp(14)	!r[14]=c3;
1012	stuw	c_12,rp(15)	!r[15]=c1;
1013
1014	ret
1015	restore	%g0,%g0,%o0	!=
1016
1017.type	bn_mul_comba8,#function
1018.size	bn_mul_comba8,(.-bn_mul_comba8)
1019
1020.align	32
1021
1022.global bn_mul_comba4
1023/*
1024 * void bn_mul_comba4(r,a,b)
1025 * BN_ULONG *r,*a,*b;
1026 */
1027bn_mul_comba4:
1028	save	%sp,FRAME_SIZE,%sp
1029	lduw	ap(0),a_0
1030	mov	1,t_2
1031	lduw	bp(0),b_0
1032	sllx	t_2,32,t_2	!=
1033	lduw	bp(1),b_1
1034	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
1035	srlx	t_1,32,c_12
1036	stuw	t_1,rp(0)	!=!r[0]=c1;
1037
1038	lduw	ap(1),a_1
1039	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
1040	addcc	c_12,t_1,c_12
1041	clr	c_3		!=
1042	bcs,a	%xcc,.+8
1043	add	c_3,t_2,c_3
1044	lduw	ap(2),a_2
1045	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
1046	addcc	c_12,t_1,t_1
1047	bcs,a	%xcc,.+8
1048	add	c_3,t_2,c_3
1049	srlx	t_1,32,c_12	!=
1050	stuw	t_1,rp(1)	!r[1]=c2;
1051	or	c_12,c_3,c_12
1052
1053	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
1054	addcc	c_12,t_1,c_12	!=
1055	clr	c_3
1056	bcs,a	%xcc,.+8
1057	add	c_3,t_2,c_3
1058	lduw	bp(2),b_2	!=
1059	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
1060	addcc	c_12,t_1,c_12
1061	bcs,a	%xcc,.+8
1062	add	c_3,t_2,c_3	!=
1063	lduw	bp(3),b_3
1064	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
1065	addcc	c_12,t_1,t_1
1066	bcs,a	%xcc,.+8	!=
1067	add	c_3,t_2,c_3
1068	srlx	t_1,32,c_12
1069	stuw	t_1,rp(2)	!r[2]=c3;
1070	or	c_12,c_3,c_12	!=
1071
1072	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
1073	addcc	c_12,t_1,c_12
1074	clr	c_3
1075	bcs,a	%xcc,.+8	!=
1076	add	c_3,t_2,c_3
1077	mulx	a_1,b_2,t_1	!mul_add_c(a[1],b[2],c1,c2,c3);
1078	addcc	c_12,t_1,c_12
1079	bcs,a	%xcc,.+8	!=
1080	add	c_3,t_2,c_3
1081	lduw	ap(3),a_3
1082	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
1083	addcc	c_12,t_1,c_12	!=
1084	bcs,a	%xcc,.+8
1085	add	c_3,t_2,c_3
1086	mulx	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
1087	addcc	c_12,t_1,t_1	!=
1088	bcs,a	%xcc,.+8
1089	add	c_3,t_2,c_3
1090	srlx	t_1,32,c_12
1091	stuw	t_1,rp(3)	!=!r[3]=c1;
1092	or	c_12,c_3,c_12
1093
1094	mulx	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
1095	addcc	c_12,t_1,c_12
1096	clr	c_3		!=
1097	bcs,a	%xcc,.+8
1098	add	c_3,t_2,c_3
1099	mulx	a_2,b_2,t_1	!mul_add_c(a[2],b[2],c2,c3,c1);
1100	addcc	c_12,t_1,c_12	!=
1101	bcs,a	%xcc,.+8
1102	add	c_3,t_2,c_3
1103	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
1104	addcc	c_12,t_1,t_1	!=
1105	bcs,a	%xcc,.+8
1106	add	c_3,t_2,c_3
1107	srlx	t_1,32,c_12
1108	stuw	t_1,rp(4)	!=!r[4]=c2;
1109	or	c_12,c_3,c_12
1110
1111	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
1112	addcc	c_12,t_1,c_12
1113	clr	c_3		!=
1114	bcs,a	%xcc,.+8
1115	add	c_3,t_2,c_3
1116	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
1117	addcc	c_12,t_1,t_1	!=
1118	bcs,a	%xcc,.+8
1119	add	c_3,t_2,c_3
1120	srlx	t_1,32,c_12
1121	stuw	t_1,rp(5)	!=!r[5]=c3;
1122	or	c_12,c_3,c_12
1123
1124	mulx	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
1125	addcc	c_12,t_1,t_1
1126	srlx	t_1,32,c_12	!=
1127	stuw	t_1,rp(6)	!r[6]=c1;
1128	stuw	c_12,rp(7)	!r[7]=c2;
1129
1130	ret
1131	restore	%g0,%g0,%o0
1132
1133.type	bn_mul_comba4,#function
1134.size	bn_mul_comba4,(.-bn_mul_comba4)
1135
1136.align	32
1137
1138.global bn_sqr_comba8
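/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 */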
1139bn_sqr_comba8:
1140	save	%sp,FRAME_SIZE,%sp
1141	mov	1,t_2
1142	lduw	ap(0),a_0
1143	sllx	t_2,32,t_2
1144	lduw	ap(1),a_1
1145	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
1146	srlx	t_1,32,c_12
1147	stuw	t_1,rp(0)	!r[0]=c1;
1148
1149	lduw	ap(2),a_2
1150	mulx	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
1151	addcc	c_12,t_1,c_12
1152	clr	c_3
1153	bcs,a	%xcc,.+8
1154	add	c_3,t_2,c_3
1155	addcc	c_12,t_1,t_1
1156	bcs,a	%xcc,.+8
1157	add	c_3,t_2,c_3
1158	srlx	t_1,32,c_12
1159	stuw	t_1,rp(1)	!r[1]=c2;
1160	or	c_12,c_3,c_12
1161
1162	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
1163	addcc	c_12,t_1,c_12
1164	clr	c_3
1165	bcs,a	%xcc,.+8
1166	add	c_3,t_2,c_3
1167	addcc	c_12,t_1,c_12
1168	bcs,a	%xcc,.+8
1169	add	c_3,t_2,c_3
1170	lduw	ap(3),a_3
1171	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
1172	addcc	c_12,t_1,t_1
1173	bcs,a	%xcc,.+8
1174	add	c_3,t_2,c_3
1175	srlx	t_1,32,c_12
1176	stuw	t_1,rp(2)	!r[2]=c3;
1177	or	c_12,c_3,c_12
1178
1179	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
1180	addcc	c_12,t_1,c_12
1181	clr	c_3
1182	bcs,a	%xcc,.+8
1183	add	c_3,t_2,c_3
1184	addcc	c_12,t_1,c_12
1185	bcs,a	%xcc,.+8
1186	add	c_3,t_2,c_3
1187	lduw	ap(4),a_4
1188	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
1189	addcc	c_12,t_1,c_12
1190	bcs,a	%xcc,.+8
1191	add	c_3,t_2,c_3
1192	addcc	c_12,t_1,t_1
1193	bcs,a	%xcc,.+8
1194	add	c_3,t_2,c_3
1195	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
1197	or	c_12,c_3,c_12
1198
1199	mulx	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
1200	addcc	c_12,t_1,c_12
1201	clr	c_3
1202	bcs,a	%xcc,.+8
1203	add	c_3,t_2,c_3
1204	addcc	c_12,t_1,c_12
1205	bcs,a	%xcc,.+8
1206	add	c_3,t_2,c_3
1207	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
1208	addcc	c_12,t_1,c_12
1209	bcs,a	%xcc,.+8
1210	add	c_3,t_2,c_3
1211	addcc	c_12,t_1,c_12
1212	bcs,a	%xcc,.+8
1213	add	c_3,t_2,c_3
1214	lduw	ap(5),a_5
1215	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
1216	addcc	c_12,t_1,t_1
1217	bcs,a	%xcc,.+8
1218	add	c_3,t_2,c_3
1219	srlx	t_1,32,c_12
1220	stuw	t_1,rp(4)	!r[4]=c2;
1221	or	c_12,c_3,c_12
1222
1223	mulx	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
1224	addcc	c_12,t_1,c_12
1225	clr	c_3
1226	bcs,a	%xcc,.+8
1227	add	c_3,t_2,c_3
1228	addcc	c_12,t_1,c_12
1229	bcs,a	%xcc,.+8
1230	add	c_3,t_2,c_3
1231	mulx	a_1,a_4,t_1	!sqr_add_c2(a,4,1,c3,c1,c2);
1232	addcc	c_12,t_1,c_12
1233	bcs,a	%xcc,.+8
1234	add	c_3,t_2,c_3
1235	addcc	c_12,t_1,c_12
1236	bcs,a	%xcc,.+8
1237	add	c_3,t_2,c_3
1238	lduw	ap(6),a_6
1239	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
1240	addcc	c_12,t_1,c_12
1241	bcs,a	%xcc,.+8
1242	add	c_3,t_2,c_3
1243	addcc	c_12,t_1,t_1
1244	bcs,a	%xcc,.+8
1245	add	c_3,t_2,c_3
1246	srlx	t_1,32,c_12
1247	stuw	t_1,rp(5)	!r[5]=c3;
1248	or	c_12,c_3,c_12
1249
1250	mulx	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
1251	addcc	c_12,t_1,c_12
1252	clr	c_3
1253	bcs,a	%xcc,.+8
1254	add	c_3,t_2,c_3
1255	addcc	c_12,t_1,c_12
1256	bcs,a	%xcc,.+8
1257	add	c_3,t_2,c_3
1258	mulx	a_5,a_1,t_1	!sqr_add_c2(a,5,1,c1,c2,c3);
1259	addcc	c_12,t_1,c_12
1260	bcs,a	%xcc,.+8
1261	add	c_3,t_2,c_3
1262	addcc	c_12,t_1,c_12
1263	bcs,a	%xcc,.+8
1264	add	c_3,t_2,c_3
1265	mulx	a_4,a_2,t_1	!sqr_add_c2(a,4,2,c1,c2,c3);
1266	addcc	c_12,t_1,c_12
1267	bcs,a	%xcc,.+8
1268	add	c_3,t_2,c_3
1269	addcc	c_12,t_1,c_12
1270	bcs,a	%xcc,.+8
1271	add	c_3,t_2,c_3
1272	lduw	ap(7),a_7
1273	mulx	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
1274	addcc	c_12,t_1,t_1
1275	bcs,a	%xcc,.+8
1276	add	c_3,t_2,c_3
1277	srlx	t_1,32,c_12
1278	stuw	t_1,rp(6)	!r[6]=c1;
1279	or	c_12,c_3,c_12
1280
1281	mulx	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
1282	addcc	c_12,t_1,c_12
1283	clr	c_3
1284	bcs,a	%xcc,.+8
1285	add	c_3,t_2,c_3
1286	addcc	c_12,t_1,c_12
1287	bcs,a	%xcc,.+8
1288	add	c_3,t_2,c_3
1289	mulx	a_1,a_6,t_1	!sqr_add_c2(a,6,1,c2,c3,c1);
1290	addcc	c_12,t_1,c_12
1291	bcs,a	%xcc,.+8
1292	add	c_3,t_2,c_3
1293	addcc	c_12,t_1,c_12
1294	bcs,a	%xcc,.+8
1295	add	c_3,t_2,c_3
1296	mulx	a_2,a_5,t_1	!sqr_add_c2(a,5,2,c2,c3,c1);
1297	addcc	c_12,t_1,c_12
1298	bcs,a	%xcc,.+8
1299	add	c_3,t_2,c_3
1300	addcc	c_12,t_1,c_12
1301	bcs,a	%xcc,.+8
1302	add	c_3,t_2,c_3
1303	mulx	a_3,a_4,t_1	!sqr_add_c2(a,4,3,c2,c3,c1);
1304	addcc	c_12,t_1,c_12
1305	bcs,a	%xcc,.+8
1306	add	c_3,t_2,c_3
1307	addcc	c_12,t_1,t_1
1308	bcs,a	%xcc,.+8
1309	add	c_3,t_2,c_3
1310	srlx	t_1,32,c_12
1311	stuw	t_1,rp(7)	!r[7]=c2;
1312	or	c_12,c_3,c_12
1313
1314	mulx	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
1315	addcc	c_12,t_1,c_12
1316	clr	c_3
1317	bcs,a	%xcc,.+8
1318	add	c_3,t_2,c_3
1319	addcc	c_12,t_1,c_12
1320	bcs,a	%xcc,.+8
1321	add	c_3,t_2,c_3
1322	mulx	a_6,a_2,t_1	!sqr_add_c2(a,6,2,c3,c1,c2);
1323	addcc	c_12,t_1,c_12
1324	bcs,a	%xcc,.+8
1325	add	c_3,t_2,c_3
1326	addcc	c_12,t_1,c_12
1327	bcs,a	%xcc,.+8
1328	add	c_3,t_2,c_3
1329	mulx	a_5,a_3,t_1	!sqr_add_c2(a,5,3,c3,c1,c2);
1330	addcc	c_12,t_1,c_12
1331	bcs,a	%xcc,.+8
1332	add	c_3,t_2,c_3
1333	addcc	c_12,t_1,c_12
1334	bcs,a	%xcc,.+8
1335	add	c_3,t_2,c_3
1336	mulx	a_4,a_4,t_1	!sqr_add_c(a,4,c3,c1,c2);
1337	addcc	c_12,t_1,t_1
1338	bcs,a	%xcc,.+8
1339	add	c_3,t_2,c_3
1340	srlx	t_1,32,c_12
1341	stuw	t_1,rp(8)	!r[8]=c3;
1342	or	c_12,c_3,c_12
1343
1344	mulx	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
1345	addcc	c_12,t_1,c_12
1346	clr	c_3
1347	bcs,a	%xcc,.+8
1348	add	c_3,t_2,c_3
1349	addcc	c_12,t_1,c_12
1350	bcs,a	%xcc,.+8
1351	add	c_3,t_2,c_3
1352	mulx	a_3,a_6,t_1	!sqr_add_c2(a,6,3,c1,c2,c3);
1353	addcc	c_12,t_1,c_12
1354	bcs,a	%xcc,.+8
1355	add	c_3,t_2,c_3
1356	addcc	c_12,t_1,c_12
1357	bcs,a	%xcc,.+8
1358	add	c_3,t_2,c_3
1359	mulx	a_4,a_5,t_1	!sqr_add_c2(a,5,4,c1,c2,c3);
1360	addcc	c_12,t_1,c_12
1361	bcs,a	%xcc,.+8
1362	add	c_3,t_2,c_3
1363	addcc	c_12,t_1,t_1
1364	bcs,a	%xcc,.+8
1365	add	c_3,t_2,c_3
1366	srlx	t_1,32,c_12
1367	stuw	t_1,rp(9)	!r[9]=c1;
1368	or	c_12,c_3,c_12
1369
1370	mulx	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
1371	addcc	c_12,t_1,c_12
1372	clr	c_3
1373	bcs,a	%xcc,.+8
1374	add	c_3,t_2,c_3
1375	addcc	c_12,t_1,c_12
1376	bcs,a	%xcc,.+8
1377	add	c_3,t_2,c_3
1378	mulx	a_6,a_4,t_1	!sqr_add_c2(a,6,4,c2,c3,c1);
1379	addcc	c_12,t_1,c_12
1380	bcs,a	%xcc,.+8
1381	add	c_3,t_2,c_3
1382	addcc	c_12,t_1,c_12
1383	bcs,a	%xcc,.+8
1384	add	c_3,t_2,c_3
1385	mulx	a_5,a_5,t_1	!sqr_add_c(a,5,c2,c3,c1);
1386	addcc	c_12,t_1,t_1
1387	bcs,a	%xcc,.+8
1388	add	c_3,t_2,c_3
1389	srlx	t_1,32,c_12
1390	stuw	t_1,rp(10)	!r[10]=c2;
1391	or	c_12,c_3,c_12
1392
1393	mulx	a_4,a_7,t_1	!sqr_add_c2(a,7,4,c3,c1,c2);
1394	addcc	c_12,t_1,c_12
1395	clr	c_3
1396	bcs,a	%xcc,.+8
1397	add	c_3,t_2,c_3
1398	addcc	c_12,t_1,c_12
1399	bcs,a	%xcc,.+8
1400	add	c_3,t_2,c_3
1401	mulx	a_5,a_6,t_1	!sqr_add_c2(a,6,5,c3,c1,c2);
1402	addcc	c_12,t_1,c_12
1403	bcs,a	%xcc,.+8
1404	add	c_3,t_2,c_3
1405	addcc	c_12,t_1,t_1
1406	bcs,a	%xcc,.+8
1407	add	c_3,t_2,c_3
1408	srlx	t_1,32,c_12
1409	stuw	t_1,rp(11)	!r[11]=c3;
1410	or	c_12,c_3,c_12
1411
1412	mulx	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
1413	addcc	c_12,t_1,c_12
1414	clr	c_3
1415	bcs,a	%xcc,.+8
1416	add	c_3,t_2,c_3
1417	addcc	c_12,t_1,c_12
1418	bcs,a	%xcc,.+8
1419	add	c_3,t_2,c_3
1420	mulx	a_6,a_6,t_1	!sqr_add_c(a,6,c1,c2,c3);
1421	addcc	c_12,t_1,t_1
1422	bcs,a	%xcc,.+8
1423	add	c_3,t_2,c_3
1424	srlx	t_1,32,c_12
1425	stuw	t_1,rp(12)	!r[12]=c1;
1426	or	c_12,c_3,c_12
1427
1428	mulx	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
1429	addcc	c_12,t_1,c_12
1430	clr	c_3
1431	bcs,a	%xcc,.+8
1432	add	c_3,t_2,c_3
1433	addcc	c_12,t_1,t_1
1434	bcs,a	%xcc,.+8
1435	add	c_3,t_2,c_3
1436	srlx	t_1,32,c_12
1437	stuw	t_1,rp(13)	!r[13]=c2;
1438	or	c_12,c_3,c_12
1439
1440	mulx	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
1441	addcc	c_12,t_1,t_1
1442	srlx	t_1,32,c_12
1443	stuw	t_1,rp(14)	!r[14]=c3;
1444	stuw	c_12,rp(15)	!r[15]=c1;
1445
1446	ret
1447	restore	%g0,%g0,%o0
1448
1449.type	bn_sqr_comba8,#function
1450.size	bn_sqr_comba8,(.-bn_sqr_comba8)
1451
1452.align	32
1453
1454.global bn_sqr_comba4
1455/*
1456 * void bn_sqr_comba4(r,a)
1457 * BN_ULONG *r,*a;
1458 */
1459bn_sqr_comba4:
1460	save	%sp,FRAME_SIZE,%sp
1461	mov	1,t_2
1462	lduw	ap(0),a_0
1463	sllx	t_2,32,t_2
1464	lduw	ap(1),a_1
1465	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
1466	srlx	t_1,32,c_12
1467	stuw	t_1,rp(0)	!r[0]=c1;
1468
1469	lduw	ap(2),a_2
1470	mulx	a_0,a_1,t_1	!sqr_add_c2(a,1,0,c2,c3,c1);
1471	addcc	c_12,t_1,c_12
1472	clr	c_3
1473	bcs,a	%xcc,.+8
1474	add	c_3,t_2,c_3
1475	addcc	c_12,t_1,t_1
1476	bcs,a	%xcc,.+8
1477	add	c_3,t_2,c_3
1478	srlx	t_1,32,c_12
1479	stuw	t_1,rp(1)	!r[1]=c2;
1480	or	c_12,c_3,c_12
1481
1482	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
1483	addcc	c_12,t_1,c_12
1484	clr	c_3
1485	bcs,a	%xcc,.+8
1486	add	c_3,t_2,c_3
1487	addcc	c_12,t_1,c_12
1488	bcs,a	%xcc,.+8
1489	add	c_3,t_2,c_3
1490	lduw	ap(3),a_3
1491	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
1492	addcc	c_12,t_1,t_1
1493	bcs,a	%xcc,.+8
1494	add	c_3,t_2,c_3
1495	srlx	t_1,32,c_12
1496	stuw	t_1,rp(2)	!r[2]=c3;
1497	or	c_12,c_3,c_12
1498
1499	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
1500	addcc	c_12,t_1,c_12
1501	clr	c_3
1502	bcs,a	%xcc,.+8
1503	add	c_3,t_2,c_3
1504	addcc	c_12,t_1,c_12
1505	bcs,a	%xcc,.+8
1506	add	c_3,t_2,c_3
1507	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
1508	addcc	c_12,t_1,c_12
1509	bcs,a	%xcc,.+8
1510	add	c_3,t_2,c_3
1511	addcc	c_12,t_1,t_1
1512	bcs,a	%xcc,.+8
1513	add	c_3,t_2,c_3
1514	srlx	t_1,32,c_12
1515	stuw	t_1,rp(3)	!r[3]=c1;
1516	or	c_12,c_3,c_12
1517
1518	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
1519	addcc	c_12,t_1,c_12
1520	clr	c_3
1521	bcs,a	%xcc,.+8
1522	add	c_3,t_2,c_3
1523	addcc	c_12,t_1,c_12
1524	bcs,a	%xcc,.+8
1525	add	c_3,t_2,c_3
1526	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
1527	addcc	c_12,t_1,t_1
1528	bcs,a	%xcc,.+8
1529	add	c_3,t_2,c_3
1530	srlx	t_1,32,c_12
1531	stuw	t_1,rp(4)	!r[4]=c2;
1532	or	c_12,c_3,c_12
1533
1534	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
1535	addcc	c_12,t_1,c_12
1536	clr	c_3
1537	bcs,a	%xcc,.+8
1538	add	c_3,t_2,c_3
1539	addcc	c_12,t_1,t_1
1540	bcs,a	%xcc,.+8
1541	add	c_3,t_2,c_3
1542	srlx	t_1,32,c_12
1543	stuw	t_1,rp(5)	!r[5]=c3;
1544	or	c_12,c_3,c_12
1545
1546	mulx	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
1547	addcc	c_12,t_1,t_1
1548	srlx	t_1,32,c_12
1549	stuw	t_1,rp(6)	!r[6]=c1;
1550	stuw	c_12,rp(7)	!r[7]=c2;
1551
1552	ret
1553	restore	%g0,%g0,%o0
1554
1555.type	bn_sqr_comba4,#function
1556.size	bn_sqr_comba4,(.-bn_sqr_comba4)
1557
1558.align	32
1559