# Assembler setup: ISA level, identification strings in read-only data,
# then the switch to the code section.
# NOTE(review): the identification string names "mips3.s" while the ISA
# level selected here is mips2 -- confirm that this is intended.
1.set	mips2
2.rdata
3.asciiz	"mips3.s, Version 1.2"
4.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5
6.text
# The routines below use $1 ($at) explicitly as a scratch register, so the
# assembler is told not to use it on its own.
7.set	noat
8
# bn_mul_add_words -- public entry point; only validates the word count.
# In:  $4 = accumulator word array (read and written),
#      $5 = source word array, $6 = word count, $7 = multiplier word.
# Out: $2 = carry word (0 when the count is <= 0); $4 receives the same value.
# A positive count transfers control to bn_mul_add_words_internal with $2 = 0.
9.align	5
10.globl	bn_mul_add_words
11.ent	bn_mul_add_words
12bn_mul_add_words:
13	.set	noreorder
14	bgtz	$6,bn_mul_add_words_internal	# count > 0: do the work
15	move	$2,$0	# (delay slot) carry = 0 on both paths
16	jr	$31	# count <= 0: nothing to do
17	move	$4,$2	# (delay slot) mirror the result into $4
18.end	bn_mul_add_words
19
# bn_mul_add_words_internal -- for each of the $6 words:
#     word[$4] = word[$4] + word[$5] * $7 + carry
# with the carry word kept in $2 and passed from one limb to the next.
# In:  $4 = accumulator array, $5 = source array, $6 = count (> 0),
#      $7 = multiplier, $2 = incoming carry (0 when entered from
#      bn_mul_add_words).
# Out: $2 = final carry word, also copied to $4.
# Scratch: $1, $3, $8-$15, HI/LO. $4/$5 are advanced and $6 is consumed.
# The main loop handles four words per pass and starts the multiply for the
# next limb before the current limb's carry handling has finished; the tail
# handles the remaining one to three words.
20.align	5
21.ent	bn_mul_add_words_internal
22bn_mul_add_words_internal:
23	.set	reorder
24	li	$3,-4	# mask that clears the two low bits
25	and	$8,$6,$3	# $8 = count rounded down to a multiple of 4
26	beqz	$8,.L_bn_mul_add_words_tail	# fewer than 4 words: tail only
27
28.L_bn_mul_add_words_loop:
29	lw	$12,0($5)
30	multu	$12,$7	# start a[0]*w
31	lw	$13,0($4)
32	lw	$14,4($5)
33	lw	$15,4($4)
34	lw	$8,2*4($5)
35	lw	$9,2*4($4)
36	addu	$13,$2	# r[0] += incoming carry
37	sltu	$2,$13,$2	# All manuals say it "compares 32-bit
38				# values", but it seems to work fine
39				# even on 64-bit registers.
40	mflo	$1	# low word of a[0]*w
41	mfhi	$12	# high word of a[0]*w
42	addu	$13,$1
43	addu	$2,$12
44	 multu	$14,$7	# start a[1]*w
45	sltu	$1,$13,$1	# carry out of the low-word addition
46	sw	$13,0($4)
47	addu	$2,$1	# $2 = carry into the next limb
48
49	lw	$10,3*4($5)
50	lw	$11,3*4($4)
51	addu	$15,$2	# second limb, same pattern
52	sltu	$2,$15,$2
53	mflo	$1
54	mfhi	$14
55	addu	$15,$1
56	addu	$2,$14
57	 multu	$8,$7	# start a[2]*w
58	sltu	$1,$15,$1
59	sw	$15,4($4)
60	addu	$2,$1
61
62	subu	$6,4	# four words consumed
63	addu $4,4*4	# pointers advance here, so the remaining
64	addu $5,4*4	# stores use negative offsets
65	addu	$9,$2	# third limb
66	sltu	$2,$9,$2
67	mflo	$1
68	mfhi	$8
69	addu	$9,$1
70	addu	$2,$8
71	 multu	$10,$7	# start a[3]*w
72	sltu	$1,$9,$1
73	sw	$9,-2*4($4)
74	addu	$2,$1
75
76
77	and	$8,$6,$3	# $8 != 0 while at least 4 words remain
78	addu	$11,$2	# fourth limb
79	sltu	$2,$11,$2
80	mflo	$1
81	mfhi	$10
82	addu	$11,$1
83	addu	$2,$10
84	sltu	$1,$11,$1
85	sw	$11,-4($4)
86	.set	noreorder
87	bgtz	$8,.L_bn_mul_add_words_loop
88	addu	$2,$1	# (delay slot) finish the carry for the fourth limb
89
90	beqz	$6,.L_bn_mul_add_words_return	# count was a multiple of 4
91	nop
92
93.L_bn_mul_add_words_tail:
94	.set	reorder
95	lw	$12,0($5)	# first leftover word
96	multu	$12,$7
97	lw	$13,0($4)
98	subu	$6,1
99	addu	$13,$2
100	sltu	$2,$13,$2
101	mflo	$1
102	mfhi	$12
103	addu	$13,$1
104	addu	$2,$12
105	sltu	$1,$13,$1
106	sw	$13,0($4)
107	addu	$2,$1
108	beqz	$6,.L_bn_mul_add_words_return
109
110	lw	$12,4($5)	# second leftover word
111	multu	$12,$7
112	lw	$13,4($4)
113	subu	$6,1
114	addu	$13,$2
115	sltu	$2,$13,$2
116	mflo	$1
117	mfhi	$12
118	addu	$13,$1
119	addu	$2,$12
120	sltu	$1,$13,$1
121	sw	$13,4($4)
122	addu	$2,$1
123	beqz	$6,.L_bn_mul_add_words_return
124
125	lw	$12,2*4($5)	# third and last leftover word
126	multu	$12,$7
127	lw	$13,2*4($4)
128	addu	$13,$2
129	sltu	$2,$13,$2
130	mflo	$1
131	mfhi	$12
132	addu	$13,$1
133	addu	$2,$12
134	sltu	$1,$13,$1
135	sw	$13,2*4($4)
136	addu	$2,$1
137
138.L_bn_mul_add_words_return:
139	.set	noreorder
140	jr	$31
141	move	$4,$2	# (delay slot) mirror the carry into $4
142.end	bn_mul_add_words_internal
143
# bn_mul_words -- public entry point; only validates the word count.
# In:  $4 = destination word array, $5 = source word array,
#      $6 = word count, $7 = multiplier word.
# Out: $2 = carry word (0 when the count is <= 0); $4 receives the same value.
# A positive count transfers control to bn_mul_words_internal with $2 = 0.
144.align	5
145.globl	bn_mul_words
146.ent	bn_mul_words
147bn_mul_words:
148	.set	noreorder
149	bgtz	$6,bn_mul_words_internal	# count > 0: do the work
150	move	$2,$0	# (delay slot) carry = 0 on both paths
151	jr	$31	# count <= 0: nothing to do
152	move	$4,$2	# (delay slot) mirror the result into $4
153.end	bn_mul_words
154
# bn_mul_words_internal -- for each of the $6 words:
#     word[$4] = low word of (word[$5] * $7 + carry)
#     carry    = high word of that sum
# with the carry kept in $2.
# In:  $4 = destination array, $5 = source array, $6 = count (> 0),
#      $7 = multiplier, $2 = incoming carry (0 when entered from bn_mul_words).
# Out: $2 = final carry word, also copied to $4.
# Scratch: $1, $3, $8-$15, HI/LO. $4/$5 are advanced and $6 is consumed.
# Four words per pass in the main loop, then a tail of up to three words.
155.align	5
156.ent	bn_mul_words_internal
157bn_mul_words_internal:
158	.set	reorder
159	li	$3,-4	# mask that clears the two low bits
160	and	$8,$6,$3	# $8 = count rounded down to a multiple of 4
161	beqz	$8,.L_bn_mul_words_tail	# fewer than 4 words: tail only
162
163.L_bn_mul_words_loop:
164	lw	$12,0($5)
165	multu	$12,$7	# start a[0]*w
166	lw	$14,4($5)
167	lw	$8,2*4($5)
168	lw	$10,3*4($5)
169	mflo	$1	# low word of the product
170	mfhi	$12	# high word of the product
171	addu	$2,$1	# low word + incoming carry
172	sltu	$13,$2,$1	# carry out of that addition
173	 multu	$14,$7	# start a[1]*w
174	sw	$2,0($4)
175	addu	$2,$13,$12	# new carry = high word + carry out
176
177	subu	$6,4	# four words consumed
178	addu $4,4*4	# pointers advance here, so the remaining
179	addu $5,4*4	# stores use negative offsets
180	mflo	$1
181	mfhi	$14
182	addu	$2,$1
183	sltu	$15,$2,$1
184	 multu	$8,$7	# start a[2]*w
185	sw	$2,-3*4($4)
186	addu	$2,$15,$14
187
188	mflo	$1
189	mfhi	$8
190	addu	$2,$1
191	sltu	$9,$2,$1
192	 multu	$10,$7	# start a[3]*w
193	sw	$2,-2*4($4)
194	addu	$2,$9,$8
195
196	and	$8,$6,$3	# $8 != 0 while at least 4 words remain
197	mflo	$1
198	mfhi	$10
199	addu	$2,$1
200	sltu	$11,$2,$1
201	sw	$2,-4($4)
202	.set	noreorder
203	bgtz	$8,.L_bn_mul_words_loop
204	addu	$2,$11,$10	# (delay slot) carry out of the fourth limb
205
206	beqz	$6,.L_bn_mul_words_return	# count was a multiple of 4
207	nop
208
209.L_bn_mul_words_tail:
210	.set	reorder
211	lw	$12,0($5)	# first leftover word
212	multu	$12,$7
213	subu	$6,1
214	mflo	$1
215	mfhi	$12
216	addu	$2,$1
217	sltu	$13,$2,$1
218	sw	$2,0($4)
219	addu	$2,$13,$12
220	beqz	$6,.L_bn_mul_words_return
221
222	lw	$12,4($5)	# second leftover word
223	multu	$12,$7
224	subu	$6,1
225	mflo	$1
226	mfhi	$12
227	addu	$2,$1
228	sltu	$13,$2,$1
229	sw	$2,4($4)
230	addu	$2,$13,$12
231	beqz	$6,.L_bn_mul_words_return
232
233	lw	$12,2*4($5)	# third and last leftover word
234	multu	$12,$7
235	mflo	$1
236	mfhi	$12
237	addu	$2,$1
238	sltu	$13,$2,$1
239	sw	$2,2*4($4)
240	addu	$2,$13,$12
241
242.L_bn_mul_words_return:
243	.set	noreorder
244	jr	$31
245	move	$4,$2	# (delay slot) mirror the carry into $4
246.end	bn_mul_words_internal
247
# bn_sqr_words -- public entry point; only validates the word count.
# In:  $4 = destination array (two words written per source word),
#      $5 = source word array, $6 = word count.
# Out: $2 = 0, also copied to $4 (the worker never modifies $2).
# A positive count transfers control to bn_sqr_words_internal.
248.align	5
249.globl	bn_sqr_words
250.ent	bn_sqr_words
251bn_sqr_words:
252	.set	noreorder
253	bgtz	$6,bn_sqr_words_internal	# count > 0: do the work
254	move	$2,$0	# (delay slot) $2 = 0 on both paths
255	jr	$31	# count <= 0: nothing to do
256	move	$4,$2	# (delay slot) mirror the result into $4
257.end	bn_sqr_words
258
# bn_sqr_words_internal -- squares each of the $6 source words and stores
# the double-width result: for source word i, the low word goes to
# destination word 2*i and the high word to destination word 2*i+1.
# In:  $4 = destination array, $5 = source array, $6 = count (> 0).
# Out: $2 is not modified here; it is copied to $4 on return.
# Scratch: $3, $8-$15, HI/LO. $4/$5 are advanced and $6 is consumed.
# Four source words per pass in the main loop, then a tail of up to three.
259.align	5
260.ent	bn_sqr_words_internal
261bn_sqr_words_internal:
262	.set	reorder
263	li	$3,-4	# mask that clears the two low bits
264	and	$8,$6,$3	# $8 = count rounded down to a multiple of 4
265	beqz	$8,.L_bn_sqr_words_tail	# fewer than 4 words: tail only
266
267.L_bn_sqr_words_loop:
268	lw	$12,0($5)
269	multu	$12,$12	# a[0]^2
270	lw	$14,4($5)
271	lw	$8,2*4($5)
272	lw	$10,3*4($5)
273	mflo	$13
274	mfhi	$12
275	sw	$13,0($4)	# low word
276	sw	$12,4($4)	# high word
277
278	multu	$14,$14	# a[1]^2
279	subu	$6,4	# four source words consumed
280	addu $4,8*4	# destination advances by eight words,
281	addu $5,4*4	# source by four; later stores use negative offsets
282	mflo	$15
283	mfhi	$14
284	sw	$15,-6*4($4)
285	sw	$14,-5*4($4)
286
287	multu	$8,$8	# a[2]^2
288	mflo	$9
289	mfhi	$8
290	sw	$9,-4*4($4)
291	sw	$8,-3*4($4)
292
293
294	multu	$10,$10	# a[3]^2
295	and	$8,$6,$3	# $8 != 0 while at least 4 words remain
296	mflo	$11
297	mfhi	$10
298	sw	$11,-2*4($4)
299
300	.set	noreorder
301	bgtz	$8,.L_bn_sqr_words_loop
302	sw	$10,-4($4)	# (delay slot) high word of a[3]^2
303
304	beqz	$6,.L_bn_sqr_words_return	# count was a multiple of 4
305	nop
306
307.L_bn_sqr_words_tail:
308	.set	reorder
309	lw	$12,0($5)	# first leftover word
310	multu	$12,$12
311	subu	$6,1
312	mflo	$13
313	mfhi	$12
314	sw	$13,0($4)
315	sw	$12,4($4)
316	beqz	$6,.L_bn_sqr_words_return
317
318	lw	$12,4($5)	# second leftover word
319	multu	$12,$12
320	subu	$6,1
321	mflo	$13
322	mfhi	$12
323	sw	$13,2*4($4)
324	sw	$12,3*4($4)
325	beqz	$6,.L_bn_sqr_words_return
326
327	lw	$12,2*4($5)	# third and last leftover word
328	multu	$12,$12
329	mflo	$13
330	mfhi	$12
331	sw	$13,4*4($4)
332	sw	$12,5*4($4)
333
334.L_bn_sqr_words_return:
335	.set	noreorder
336	jr	$31
337	move	$4,$2	# (delay slot) $2 still holds the value set by the caller
338
339.end	bn_sqr_words_internal
340
# bn_add_words -- public entry point; only validates the word count.
# In:  $4 = destination word array, $5 = first source array,
#      $6 = second source array, $7 = word count.
# Out: $2 = carry out (0 when the count is <= 0); $4 receives the same value.
# A positive count transfers control to bn_add_words_internal with $2 = 0.
341.align	5
342.globl	bn_add_words
343.ent	bn_add_words
344bn_add_words:
345	.set	noreorder
346	bgtz	$7,bn_add_words_internal	# count > 0: do the work
347	move	$2,$0	# (delay slot) carry = 0 on both paths
348	jr	$31	# count <= 0: nothing to do
349	move	$4,$2	# (delay slot) mirror the result into $4
350.end	bn_add_words
351
# bn_add_words_internal -- for each of the $7 words:
#     word[$4] = word[$5] + word[$6] + carry
# with the carry kept in $2.
# In:  $4 = destination array, $5/$6 = source arrays, $7 = count (> 0),
#      $2 = incoming carry (0 when entered from bn_add_words).
# Out: $2 = final carry, also copied to $4.
# Scratch: $1, $3, $8-$15, $24, $25. $4/$5/$6 are advanced, $7 is consumed.
# Each limb needs two carry tests: one for a+b ($24 or $25) and one for
# adding the incoming carry ($2); their sum is the outgoing carry.
352.align	5
353.ent	bn_add_words_internal
354bn_add_words_internal:
355	.set	reorder
356	li	$3,-4	# mask that clears the two low bits
357	and	$1,$7,$3	# $1 = count rounded down to a multiple of 4
358	beqz	$1,.L_bn_add_words_tail	# fewer than 4 words: tail only
359
360.L_bn_add_words_loop:
361	lw	$12,0($5)
362	lw	$8,0($6)
363	subu	$7,4	# four words consumed
364	lw	$13,4($5)
365	and	$1,$7,$3	# $1 != 0 while at least 4 words remain
366	lw	$14,2*4($5)
367	addu $6,4*4	# pointers advance between the loads, so
368	lw	$15,3*4($5)
369	addu $4,4*4	# later accesses use negative offsets
370	lw	$9,-3*4($6)
371	addu $5,4*4
372	lw	$10,-2*4($6)
373	lw	$11,-4($6)
374	addu	$8,$12	# $8 = a[0] + b[0]
375	sltu	$24,$8,$12	# carry out of a + b
376	addu	$12,$8,$2	# add the incoming carry
377	sltu	$2,$12,$8	# carry out of that addition
378	sw	$12,-4*4($4)
379	addu	$2,$24	# combined carry into the next limb
380
381	addu	$9,$13	# second limb, same pattern
382	sltu	$25,$9,$13
383	addu	$13,$9,$2
384	sltu	$2,$13,$9
385	sw	$13,-3*4($4)
386	addu	$2,$25
387
388	addu	$10,$14	# third limb
389	sltu	$24,$10,$14
390	addu	$14,$10,$2
391	sltu	$2,$14,$10
392	sw	$14,-2*4($4)
393	addu	$2,$24
394
395	addu	$11,$15	# fourth limb
396	sltu	$25,$11,$15
397	addu	$15,$11,$2
398	sltu	$2,$15,$11
399	sw	$15,-4($4)
400
401	.set	noreorder
402	bgtz	$1,.L_bn_add_words_loop
403	addu	$2,$25	# (delay slot) combined carry of the fourth limb
404
405	beqz	$7,.L_bn_add_words_return	# count was a multiple of 4
406	nop
407
408.L_bn_add_words_tail:
409	.set	reorder
410	lw	$12,0($5)	# first leftover word
411	lw	$8,0($6)
412	addu	$8,$12
413	subu	$7,1
414	sltu	$24,$8,$12
415	addu	$12,$8,$2
416	sltu	$2,$12,$8
417	sw	$12,0($4)
418	addu	$2,$24
419	beqz	$7,.L_bn_add_words_return
420
421	lw	$13,4($5)	# second leftover word
422	lw	$9,4($6)
423	addu	$9,$13
424	subu	$7,1
425	sltu	$25,$9,$13
426	addu	$13,$9,$2
427	sltu	$2,$13,$9
428	sw	$13,4($4)
429	addu	$2,$25
430	beqz	$7,.L_bn_add_words_return
431
432	lw	$14,2*4($5)	# third and last leftover word
433	lw	$10,2*4($6)
434	addu	$10,$14
435	sltu	$24,$10,$14
436	addu	$14,$10,$2
437	sltu	$2,$14,$10
438	sw	$14,2*4($4)
439	addu	$2,$24
440
441.L_bn_add_words_return:
442	.set	noreorder
443	jr	$31
444	move	$4,$2	# (delay slot) mirror the carry into $4
445
446.end	bn_add_words_internal
447
# bn_sub_words -- public entry point; only validates the word count.
# In:  $4 = destination word array, $5 = minuend array,
#      $6 = subtrahend array, $7 = word count.
# Out: $2 = borrow out (0 when the count is <= 0).
# A positive count transfers control to bn_sub_words_internal with $2 = 0.
# On the early-return path $4 is cleared from $0 directly; the value is
# the same as $2 at that point.
448.align	5
449.globl	bn_sub_words
450.ent	bn_sub_words
451bn_sub_words:
452	.set	noreorder
453	bgtz	$7,bn_sub_words_internal	# count > 0: do the work
454	move	$2,$0	# (delay slot) borrow = 0 on both paths
455	jr	$31	# count <= 0: nothing to do
456	move	$4,$0	# (delay slot) $4 = 0, matching $2
457.end	bn_sub_words
458
# bn_sub_words_internal -- for each of the $7 words:
#     word[$4] = word[$5] - word[$6] - borrow
# with the borrow kept in $2.
# In:  $4 = destination array, $5 = minuend array, $6 = subtrahend array,
#      $7 = count (> 0), $2 = incoming borrow (0 when entered from
#      bn_sub_words).
# Out: $2 = final borrow, also copied to $4.
# Scratch: $1, $3, $8-$15, $24, $25. $4/$5/$6 are advanced, $7 is consumed.
# Each limb needs two borrow tests: a < b ($24 or $25) and whether
# subtracting the incoming borrow wrapped around ($2); their sum is the
# outgoing borrow.
459.align	5
460.ent	bn_sub_words_internal
461bn_sub_words_internal:
462	.set	reorder
463	li	$3,-4	# mask that clears the two low bits
464	and	$1,$7,$3	# $1 = count rounded down to a multiple of 4
465	beqz	$1,.L_bn_sub_words_tail	# fewer than 4 words: tail only
466
467.L_bn_sub_words_loop:
468	lw	$12,0($5)
469	lw	$8,0($6)
470	subu	$7,4	# four words consumed
471	lw	$13,4($5)
472	and	$1,$7,$3	# $1 != 0 while at least 4 words remain
473	lw	$14,2*4($5)
474	addu $6,4*4	# pointers advance between the loads, so
475	lw	$15,3*4($5)
476	addu $4,4*4	# later accesses use negative offsets
477	lw	$9,-3*4($6)
478	addu $5,4*4
479	lw	$10,-2*4($6)
480	lw	$11,-4($6)
481	sltu	$24,$12,$8	# borrow out of a - b
482	subu	$8,$12,$8	# $8 = a[0] - b[0]
483	subu	$12,$8,$2	# subtract the incoming borrow
484	sgtu	$2,$12,$8	# result grew: that subtraction wrapped
485	sw	$12,-4*4($4)
486	addu	$2,$24	# combined borrow into the next limb
487
488	sltu	$25,$13,$9	# second limb, same pattern
489	subu	$9,$13,$9
490	subu	$13,$9,$2
491	sgtu	$2,$13,$9
492	sw	$13,-3*4($4)
493	addu	$2,$25
494
495
496	sltu	$24,$14,$10	# third limb
497	subu	$10,$14,$10
498	subu	$14,$10,$2
499	sgtu	$2,$14,$10
500	sw	$14,-2*4($4)
501	addu	$2,$24
502
503	sltu	$25,$15,$11	# fourth limb
504	subu	$11,$15,$11
505	subu	$15,$11,$2
506	sgtu	$2,$15,$11
507	sw	$15,-4($4)
508
509	.set	noreorder
510	bgtz	$1,.L_bn_sub_words_loop
511	addu	$2,$25	# (delay slot) combined borrow of the fourth limb
512
513	beqz	$7,.L_bn_sub_words_return	# count was a multiple of 4
514	nop
515
516.L_bn_sub_words_tail:
517	.set	reorder
518	lw	$12,0($5)	# first leftover word
519	lw	$8,0($6)
520	subu	$7,1
521	sltu	$24,$12,$8
522	subu	$8,$12,$8
523	subu	$12,$8,$2
524	sgtu	$2,$12,$8
525	sw	$12,0($4)
526	addu	$2,$24
527	beqz	$7,.L_bn_sub_words_return
528
529	lw	$13,4($5)	# second leftover word
530	subu	$7,1
531	lw	$9,4($6)
532	sltu	$25,$13,$9
533	subu	$9,$13,$9
534	subu	$13,$9,$2
535	sgtu	$2,$13,$9
536	sw	$13,4($4)
537	addu	$2,$25
538	beqz	$7,.L_bn_sub_words_return
539
540	lw	$14,2*4($5)	# third and last leftover word
541	lw	$10,2*4($6)
542	sltu	$24,$14,$10
543	subu	$10,$14,$10
544	subu	$14,$10,$2
545	sgtu	$2,$14,$10
546	sw	$14,2*4($4)
547	addu	$2,$24
548
549.L_bn_sub_words_return:
550	.set	noreorder
551	jr	$31
552	move	$4,$2	# (delay slot) mirror the borrow into $4
553.end	bn_sub_words_internal
554
# bn_div_3_words -- public entry point.
# In:  $4 = pointer to the top word of a three-word value (the words at
#      offsets 0, -4 and -8 from it are read), $5 = second divisor word,
#      $6 = leading divisor word.
# Out: $2 = quotient word, also copied to $4.
# If the top word equals the leading divisor word, -1 (all ones) is
# returned at once. Otherwise the top two words are loaded into $4/$5
# and control passes to bn_div_3_words_internal.
555.align 5
556.globl	bn_div_3_words
557.ent	bn_div_3_words
558bn_div_3_words:
559	.set	noreorder
560	move	$7,$4		# we know that bn_div_words does not
561				# touch $7, $10, $11 and preserves $6
562				# so that we can save two arguments
563				# and return address in registers
564				# instead of stack:-)
565
566	lw	$4,($7)	# top word becomes the high dividend word
567	move	$10,$5	# keep the second divisor word in $10
568	bne	$4,$6,bn_div_3_words_internal
569	lw	$5,-4($7)	# (delay slot) next word becomes the low dividend word
570	li	$2,-1	# top word == leading divisor word: answer is all ones
571	jr	$31
572	move	$4,$2	# (delay slot) mirror the result into $4
573.end	bn_div_3_words
574
# bn_div_3_words_internal -- quotient estimate plus correction loop.
# In:  $4:$5 = top two dividend words, $6 = leading divisor word,
#      $7 = pointer to the top dividend word, $10 = second divisor word.
# Out: $2 = corrected quotient word, also copied to $4.
# Steps:
#   1. bn_div_words_internal leaves the quotient of $4:$5 by $6 in $2 and
#      the remainder in $5. The return address is parked in $11 because
#      bal overwrites $31.
#   2. $13:$12 = $10 * quotient (high:low).
#   3. Each pass of the loop adds $6 to the remainder in $5 and subtracts
#      $10 from $13:$12. The loop is left once the high word $13 is below
#      the remainder, once the remainder addition wraps around, or once
#      $13 equals the remainder and the third dividend word ($14) is not
#      below the low word $12. The quotient is decremented once for each
#      pass that branches back to the top of the loop.
# Scratch: $1, $8, $12-$15, $24, $25, HI/LO, plus whatever
# bn_div_words_internal uses.
575.align	5
576.ent	bn_div_3_words_internal
577bn_div_3_words_internal:
578	.set	reorder
579	move	$11,$31	# save the return address in a register
580	bal	bn_div_words_internal
581	move	$31,$11	# restore the return address
582	multu	$10,$2	# second divisor word * quotient estimate
583	lw	$14,-2*4($7)	# third dividend word
584	move	$8,$0
585	mfhi	$13
586	mflo	$12
587	sltu	$24,$13,$5	# $24 = (product high word < remainder)
588.L_bn_div_3_words_inner_loop:
589	bnez	$24,.L_bn_div_3_words_inner_loop_done
590	sgeu	$1,$14,$12	# third dividend word >= product low word ...
591	seq	$25,$13,$5	# ... and product high word == remainder
592	and	$1,$25	# $1 != 0: estimate accepted after this pass
593	sltu	$15,$12,$10	# borrow for the two-word subtraction below
594	addu	$5,$6	# remainder += leading divisor word
595	subu	$13,$15
596	subu	$12,$10	# product -= second divisor word
597	sltu	$24,$13,$5
598	sltu	$8,$5,$6	# remainder addition wrapped around
599	or	$24,$8	# either condition ends the loop on the next test
600	.set	noreorder
601	beqz	$1,.L_bn_div_3_words_inner_loop
602	subu	$2,1	# (delay slot) quotient--, executed on both paths
603	addu	$2,1	# fall-through only: undo the decrement
604	.set	reorder
605.L_bn_div_3_words_inner_loop_done:
606	.set	noreorder
607	jr	$31
608	move	$4,$2	# (delay slot) mirror the result into $4
609.end	bn_div_3_words_internal
610
# bn_div_words -- public entry point; only rejects a zero divisor.
# In:  $4 = high dividend word, $5 = low dividend word, $6 = divisor.
# Out: $2 = quotient, also copied to $4. For a zero divisor the result is
#      -1 (all ones) and no trap is raised.
# A non-zero divisor transfers control to bn_div_words_internal.
611.align	5
612.globl	bn_div_words
613.ent	bn_div_words
614bn_div_words:
615	.set	noreorder
616	bnez	$6,bn_div_words_internal
617	li	$2,-1		# I would rather signal div-by-zero
618				# which can be done with 'break 7'
619	jr	$31
620	move	$4,$2	# (delay slot) mirror the result into $4
621.end	bn_div_words
622
# bn_div_words_internal -- divides the two-word value $4:$5 by $6 and
# builds the one-word quotient sixteen bits at a time.
# In:  $4 = high dividend word, $5 = low dividend word, $6 = divisor (!= 0).
# Out: $2 = quotient (also copied to $4), $3 and $5 = remainder,
#      $6 = divisor, restored to its value on entry.
# Scratch: $1, $8, $9, $12-$15, $24, $25, HI/LO.
#      $7, $10 and $11 are not touched; bn_div_3_words relies on that.
# Steps:
#   1. Normalise: shift the divisor left until its top bit is set,
#      counting the shifts in $25, then shift the dividend by the same
#      amount. 'break 6' is raised if that would shift set bits out of
#      the high dividend word.
#   2. If the high dividend word is >= the divisor, subtract the divisor
#      from it once.
#   3. Twice (once per quotient half): estimate a 16-bit digit by dividing
#      the current high word by the top half of the divisor, or use 0xffff
#      when the two top halves are equal; then decrement the digit while
#      digit * divisor exceeds the current three half-words of dividend.
#   4. Undo the normalisation on the remainder and on the divisor.
# The first instructions rely on the '.set noreorder' still in effect from
# bn_div_words immediately above: the instruction after each branch is its
# delay slot.
623.align	5
624.ent	bn_div_words_internal
625bn_div_words_internal:
626	move	$3,$0
627	bltz	$6,.L_bn_div_words_body	# top bit already set: no shift needed
628	move	$25,$3	# (delay slot) shift count = 0
629	sll	$6,1
630	bgtz	$6,.-4	# back to the sll until the top bit is set
631	addu	$25,1	# (delay slot) count every shift
632
633	.set	reorder
634	negu	$13,$25	# negated shift count, used as a shift amount
635	li	$14,-1
636	sll	$14,$13	# mask of the top $25 bits
637	and	$14,$4	# bits of the high word that would be shifted out
638	srl	$1,$5,$13	# bits that move from the low word into the high word
639	.set	noreorder
640	beqz	$14,.+12	# nothing lost: skip the nop and the break
641	nop
642	break	6		# signal overflow
643	.set	reorder
644	sll	$4,$25
645	sll	$5,$25
646	or	$4,$1	# normalised dividend is now in $4:$5
647.L_bn_div_words_body:
648	srl	$3,$6,4*4	# bits
649	sgeu	$1,$4,$6
650	.set	noreorder
651	beqz	$1,.+12	# high word < divisor: skip the nop and the subu
652	nop
653	subu	$4,$6
654	.set	reorder
655
656	li	$8,-1
657	srl	$9,$4,4*4	# bits
658	srl	$8,4*4	# q=0xffff (all-ones half-word)
659	beq	$3,$9,.L_bn_div_words_skip_div1	# equal top halves: keep q = 0xffff
660	divu	$0,$4,$3
661	mflo	$8	# q = high word / top half of the divisor
662.L_bn_div_words_skip_div1:
663	multu	$6,$8	# divisor * q
664	sll	$15,$4,4*4	# bits
665	srl	$1,$5,4*4	# bits
666	or	$15,$1	# $15 = low half of $4 : high half of $5
667	mflo	$12
668	mfhi	$13
669.L_bn_div_words_inner_loop1:
670	sltu	$14,$15,$12
671	seq	$24,$9,$13
672	sltu	$1,$9,$13
673	and	$14,$24
674	sltu	$2,$12,$6	# borrow for the two-word subtraction below
675	or	$1,$14	# $1 = ($9:$15 < $13:$12), i.e. q is too large
676	.set	noreorder
677	beqz	$1,.L_bn_div_words_inner_loop1_done
678	subu	$13,$2	# (delay slot) also runs on exit; $13 is reloaded before its next use
679	subu	$12,$6	# product -= divisor
680	b	.L_bn_div_words_inner_loop1
681	subu	$8,1	# (delay slot) q--
682	.set	reorder
683.L_bn_div_words_inner_loop1_done:
684
685	sll	$5,4*4	# bits
686	subu	$4,$15,$12	# partial remainder for the second digit
687	sll	$2,$8,4*4	# bits
688
689	li	$8,-1
690	srl	$9,$4,4*4	# bits
691	srl	$8,4*4	# q=0xffff (all-ones half-word)
692	beq	$3,$9,.L_bn_div_words_skip_div2	# equal top halves: keep q = 0xffff
693	divu	$0,$4,$3
694	mflo	$8
695.L_bn_div_words_skip_div2:
696	multu	$6,$8	# divisor * q
697	sll	$15,$4,4*4	# bits
698	srl	$1,$5,4*4	# bits
699	or	$15,$1
700	mflo	$12
701	mfhi	$13
702.L_bn_div_words_inner_loop2:
703	sltu	$14,$15,$12
704	seq	$24,$9,$13
705	sltu	$1,$9,$13
706	and	$14,$24
707	sltu	$3,$12,$6	# borrow; $3 is free now that both divu are done
708	or	$1,$14	# $1 = ($9:$15 < $13:$12), i.e. q is too large
709	.set	noreorder
710	beqz	$1,.L_bn_div_words_inner_loop2_done
711	subu	$13,$3	# (delay slot) also runs on exit; $13 is not used afterwards
712	subu	$12,$6	# product -= divisor
713	b	.L_bn_div_words_inner_loop2
714	subu	$8,1	# (delay slot) q--
715	.set	reorder
716.L_bn_div_words_inner_loop2_done:
717
718	subu	$4,$15,$12	# remainder, still shifted by the normalisation
719	or	$2,$8	# quotient = first digit << 16 | second digit
720	srl	$3,$4,$25	# $3 contains remainder if anybody wants it
721	srl	$6,$25		# restore $6
722
723	.set	noreorder
724	move	$5,$3	# remainder is also left in $5
725	jr	$31
726	move	$4,$2	# (delay slot) mirror the quotient into $4
727.end	bn_div_words_internal
728
# bn_mul_comba8 -- full product of two 8-word operands.
# In:  $4 = r (16 result words are stored), $5 = a (8 words), $6 = b (8 words).
# The product is built column by column: column k is the sum of all
# a[i]*b[j] with i+j = k. A three-word accumulator rotates through $2, $3
# and $7 (c1, c2, c3 in the mul_add_c comments); when a column is complete
# its lowest accumulator word is stored as r[k].
# Register use:
#   a[0..3] = $12-$15      b[0..3] = $8-$11
#   a[4] = $16  b[4] = $17  a[5] = $18  b[5] = $19
#   a[6] = $20  b[6] = $21  a[7] = $5   b[7] = $6
#   (the a and b pointers are overwritten by their own last word)
#   $24/$25 = low/high word of the current product, $1 = carry scratch.
# $16-$21 are saved in a six-word stack frame and restored on exit.
# Each mul_add_c comment sits on the multu that STARTS that product; the
# additions for it follow the next mflo/mfhi pair, so a multiply always
# overlaps the accumulation of the previous product.
729.align	5
730.globl	bn_mul_comba8
731.ent	bn_mul_comba8
732bn_mul_comba8:
733	.set	noreorder
734	.frame	$29,6*4,$31
735	.mask	0x003f0000,-4
736	subu $29,6*4
737	sw	$21,5*4($29)
738	sw	$20,4*4($29)
739	sw	$19,3*4($29)
740	sw	$18,2*4($29)
741	sw	$17,1*4($29)
742	sw	$16,0*4($29)
743
744	.set	reorder
745	lw	$12,0($5)	# If compiled with -mips3 option on
746				# R5000 box assembler barks on this
747				# line with "should not have mult/div
748				# as last instruction in bb (R10K
749				# bug)" warning. If anybody out there
750				# has a clue about how to circumvent
751				# this do send me a note.
752				#		<appro@fy.chalmers.se>
753
754	lw	$8,0($6)
755	lw	$13,4($5)
756	lw	$14,2*4($5)
757	multu	$12,$8		# mul_add_c(a[0],b[0],c1,c2,c3);
758	lw	$15,3*4($5)
759	lw	$9,4($6)
760	lw	$10,2*4($6)
761	lw	$11,3*4($6)
762	mflo	$2
763	mfhi	$3
764
765	lw	$16,4*4($5)
766	lw	$18,5*4($5)
767	multu	$12,$9		# mul_add_c(a[0],b[1],c2,c3,c1);
768	lw	$20,6*4($5)
769	lw	$5,7*4($5)
770	lw	$17,4*4($6)
771	lw	$19,5*4($6)
772	mflo	$24
773	mfhi	$25
774	addu	$3,$24
775	sltu	$1,$3,$24
776	multu	$13,$8		# mul_add_c(a[1],b[0],c2,c3,c1);
777	addu	$7,$25,$1
778	lw	$21,6*4($6)
779	lw	$6,7*4($6)
780	sw	$2,0($4)	# r[0]=c1;
781	mflo	$24
782	mfhi	$25
783	addu	$3,$24
784	sltu	$1,$3,$24
785	 multu	$14,$8		# mul_add_c(a[2],b[0],c3,c1,c2);
786	addu	$25,$1
787	addu	$7,$25
788	sltu	$2,$7,$25
789	sw	$3,4($4)	# r[1]=c2;
790
	# column 2: terms with i+j = 2
791	mflo	$24
792	mfhi	$25
793	addu	$7,$24
794	sltu	$1,$7,$24
795	multu	$13,$9		# mul_add_c(a[1],b[1],c3,c1,c2);
796	addu	$25,$1
797	addu	$2,$25
798	mflo	$24
799	mfhi	$25
800	addu	$7,$24
801	sltu	$1,$7,$24
802	multu	$12,$10		# mul_add_c(a[0],b[2],c3,c1,c2);
803	addu	$25,$1
804	addu	$2,$25
805	sltu	$3,$2,$25
806	mflo	$24
807	mfhi	$25
808	addu	$7,$24
809	sltu	$1,$7,$24
810	 multu	$12,$11		# mul_add_c(a[0],b[3],c1,c2,c3);
811	addu	$25,$1
812	addu	$2,$25
813	sltu	$1,$2,$25
814	addu	$3,$1
815	sw	$7,2*4($4)	# r[2]=c3;
816
	# column 3: terms with i+j = 3
817	mflo	$24
818	mfhi	$25
819	addu	$2,$24
820	sltu	$1,$2,$24
821	multu	$13,$10		# mul_add_c(a[1],b[2],c1,c2,c3);
822	addu	$25,$1
823	addu	$3,$25
824	sltu	$7,$3,$25
825	mflo	$24
826	mfhi	$25
827	addu	$2,$24
828	sltu	$1,$2,$24
829	multu	$14,$9		# mul_add_c(a[2],b[1],c1,c2,c3);
830	addu	$25,$1
831	addu	$3,$25
832	sltu	$1,$3,$25
833	addu	$7,$1
834	mflo	$24
835	mfhi	$25
836	addu	$2,$24
837	sltu	$1,$2,$24
838	multu	$15,$8		# mul_add_c(a[3],b[0],c1,c2,c3);
839	addu	$25,$1
840	addu	$3,$25
841	sltu	$1,$3,$25
842	addu	$7,$1
843	mflo	$24
844	mfhi	$25
845	addu	$2,$24
846	sltu	$1,$2,$24
847	 multu	$16,$8		# mul_add_c(a[4],b[0],c2,c3,c1);
848	addu	$25,$1
849	addu	$3,$25
850	sltu	$1,$3,$25
851	addu	$7,$1
852	sw	$2,3*4($4)	# r[3]=c1;
853
	# column 4: terms with i+j = 4
854	mflo	$24
855	mfhi	$25
856	addu	$3,$24
857	sltu	$1,$3,$24
858	multu	$15,$9		# mul_add_c(a[3],b[1],c2,c3,c1);
859	addu	$25,$1
860	addu	$7,$25
861	sltu	$2,$7,$25
862	mflo	$24
863	mfhi	$25
864	addu	$3,$24
865	sltu	$1,$3,$24
866	multu	$14,$10		# mul_add_c(a[2],b[2],c2,c3,c1);
867	addu	$25,$1
868	addu	$7,$25
869	sltu	$1,$7,$25
870	addu	$2,$1
871	mflo	$24
872	mfhi	$25
873	addu	$3,$24
874	sltu	$1,$3,$24
875	multu	$13,$11		# mul_add_c(a[1],b[3],c2,c3,c1);
876	addu	$25,$1
877	addu	$7,$25
878	sltu	$1,$7,$25
879	addu	$2,$1
880	mflo	$24
881	mfhi	$25
882	addu	$3,$24
883	sltu	$1,$3,$24
884	multu	$12,$17		# mul_add_c(a[0],b[4],c2,c3,c1);
885	addu	$25,$1
886	addu	$7,$25
887	sltu	$1,$7,$25
888	addu	$2,$1
889	mflo	$24
890	mfhi	$25
891	addu	$3,$24
892	sltu	$1,$3,$24
893	 multu	$12,$19		# mul_add_c(a[0],b[5],c3,c1,c2);
894	addu	$25,$1
895	addu	$7,$25
896	sltu	$1,$7,$25
897	addu	$2,$1
898	sw	$3,4*4($4)	# r[4]=c2;
899
	# column 5: terms with i+j = 5
900	mflo	$24
901	mfhi	$25
902	addu	$7,$24
903	sltu	$1,$7,$24
904	multu	$13,$17		# mul_add_c(a[1],b[4],c3,c1,c2);
905	addu	$25,$1
906	addu	$2,$25
907	sltu	$3,$2,$25
908	mflo	$24
909	mfhi	$25
910	addu	$7,$24
911	sltu	$1,$7,$24
912	multu	$14,$11		# mul_add_c(a[2],b[3],c3,c1,c2);
913	addu	$25,$1
914	addu	$2,$25
915	sltu	$1,$2,$25
916	addu	$3,$1
917	mflo	$24
918	mfhi	$25
919	addu	$7,$24
920	sltu	$1,$7,$24
921	multu	$15,$10		# mul_add_c(a[3],b[2],c3,c1,c2);
922	addu	$25,$1
923	addu	$2,$25
924	sltu	$1,$2,$25
925	addu	$3,$1
926	mflo	$24
927	mfhi	$25
928	addu	$7,$24
929	sltu	$1,$7,$24
930	multu	$16,$9		# mul_add_c(a[4],b[1],c3,c1,c2);
931	addu	$25,$1
932	addu	$2,$25
933	sltu	$1,$2,$25
934	addu	$3,$1
935	mflo	$24
936	mfhi	$25
937	addu	$7,$24
938	sltu	$1,$7,$24
939	multu	$18,$8		# mul_add_c(a[5],b[0],c3,c1,c2);
940	addu	$25,$1
941	addu	$2,$25
942	sltu	$1,$2,$25
943	addu	$3,$1
944	mflo	$24
945	mfhi	$25
946	addu	$7,$24
947	sltu	$1,$7,$24
948	 multu	$20,$8		# mul_add_c(a[6],b[0],c1,c2,c3);
949	addu	$25,$1
950	addu	$2,$25
951	sltu	$1,$2,$25
952	addu	$3,$1
953	sw	$7,5*4($4)	# r[5]=c3;
954
	# column 6: terms with i+j = 6
955	mflo	$24
956	mfhi	$25
957	addu	$2,$24
958	sltu	$1,$2,$24
959	multu	$18,$9		# mul_add_c(a[5],b[1],c1,c2,c3);
960	addu	$25,$1
961	addu	$3,$25
962	sltu	$7,$3,$25
963	mflo	$24
964	mfhi	$25
965	addu	$2,$24
966	sltu	$1,$2,$24
967	multu	$16,$10		# mul_add_c(a[4],b[2],c1,c2,c3);
968	addu	$25,$1
969	addu	$3,$25
970	sltu	$1,$3,$25
971	addu	$7,$1
972	mflo	$24
973	mfhi	$25
974	addu	$2,$24
975	sltu	$1,$2,$24
976	multu	$15,$11		# mul_add_c(a[3],b[3],c1,c2,c3);
977	addu	$25,$1
978	addu	$3,$25
979	sltu	$1,$3,$25
980	addu	$7,$1
981	mflo	$24
982	mfhi	$25
983	addu	$2,$24
984	sltu	$1,$2,$24
985	multu	$14,$17		# mul_add_c(a[2],b[4],c1,c2,c3);
986	addu	$25,$1
987	addu	$3,$25
988	sltu	$1,$3,$25
989	addu	$7,$1
990	mflo	$24
991	mfhi	$25
992	addu	$2,$24
993	sltu	$1,$2,$24
994	multu	$13,$19		# mul_add_c(a[1],b[5],c1,c2,c3);
995	addu	$25,$1
996	addu	$3,$25
997	sltu	$1,$3,$25
998	addu	$7,$1
999	mflo	$24
1000	mfhi	$25
1001	addu	$2,$24
1002	sltu	$1,$2,$24
1003	multu	$12,$21		# mul_add_c(a[0],b[6],c1,c2,c3);
1004	addu	$25,$1
1005	addu	$3,$25
1006	sltu	$1,$3,$25
1007	addu	$7,$1
1008	mflo	$24
1009	mfhi	$25
1010	addu	$2,$24
1011	sltu	$1,$2,$24
1012	 multu	$12,$6		# mul_add_c(a[0],b[7],c2,c3,c1);
1013	addu	$25,$1
1014	addu	$3,$25
1015	sltu	$1,$3,$25
1016	addu	$7,$1
1017	sw	$2,6*4($4)	# r[6]=c1;
1018
	# column 7: terms with i+j = 7
1019	mflo	$24
1020	mfhi	$25
1021	addu	$3,$24
1022	sltu	$1,$3,$24
1023	multu	$13,$21		# mul_add_c(a[1],b[6],c2,c3,c1);
1024	addu	$25,$1
1025	addu	$7,$25
1026	sltu	$2,$7,$25
1027	mflo	$24
1028	mfhi	$25
1029	addu	$3,$24
1030	sltu	$1,$3,$24
1031	multu	$14,$19		# mul_add_c(a[2],b[5],c2,c3,c1);
1032	addu	$25,$1
1033	addu	$7,$25
1034	sltu	$1,$7,$25
1035	addu	$2,$1
1036	mflo	$24
1037	mfhi	$25
1038	addu	$3,$24
1039	sltu	$1,$3,$24
1040	multu	$15,$17		# mul_add_c(a[3],b[4],c2,c3,c1);
1041	addu	$25,$1
1042	addu	$7,$25
1043	sltu	$1,$7,$25
1044	addu	$2,$1
1045	mflo	$24
1046	mfhi	$25
1047	addu	$3,$24
1048	sltu	$1,$3,$24
1049	multu	$16,$11		# mul_add_c(a[4],b[3],c2,c3,c1);
1050	addu	$25,$1
1051	addu	$7,$25
1052	sltu	$1,$7,$25
1053	addu	$2,$1
1054	mflo	$24
1055	mfhi	$25
1056	addu	$3,$24
1057	sltu	$1,$3,$24
1058	multu	$18,$10		# mul_add_c(a[5],b[2],c2,c3,c1);
1059	addu	$25,$1
1060	addu	$7,$25
1061	sltu	$1,$7,$25
1062	addu	$2,$1
1063	mflo	$24
1064	mfhi	$25
1065	addu	$3,$24
1066	sltu	$1,$3,$24
1067	multu	$20,$9		# mul_add_c(a[6],b[1],c2,c3,c1);
1068	addu	$25,$1
1069	addu	$7,$25
1070	sltu	$1,$7,$25
1071	addu	$2,$1
1072	mflo	$24
1073	mfhi	$25
1074	addu	$3,$24
1075	sltu	$1,$3,$24
1076	multu	$5,$8		# mul_add_c(a[7],b[0],c2,c3,c1);
1077	addu	$25,$1
1078	addu	$7,$25
1079	sltu	$1,$7,$25
1080	addu	$2,$1
1081	mflo	$24
1082	mfhi	$25
1083	addu	$3,$24
1084	sltu	$1,$3,$24
1085	 multu	$5,$9		# mul_add_c(a[7],b[1],c3,c1,c2);
1086	addu	$25,$1
1087	addu	$7,$25
1088	sltu	$1,$7,$25
1089	addu	$2,$1
1090	sw	$3,7*4($4)	# r[7]=c2;
1091
	# column 8: terms with i+j = 8
1092	mflo	$24
1093	mfhi	$25
1094	addu	$7,$24
1095	sltu	$1,$7,$24
1096	multu	$20,$10		# mul_add_c(a[6],b[2],c3,c1,c2);
1097	addu	$25,$1
1098	addu	$2,$25
1099	sltu	$3,$2,$25
1100	mflo	$24
1101	mfhi	$25
1102	addu	$7,$24
1103	sltu	$1,$7,$24
1104	multu	$18,$11		# mul_add_c(a[5],b[3],c3,c1,c2);
1105	addu	$25,$1
1106	addu	$2,$25
1107	sltu	$1,$2,$25
1108	addu	$3,$1
1109	mflo	$24
1110	mfhi	$25
1111	addu	$7,$24
1112	sltu	$1,$7,$24
1113	multu	$16,$17		# mul_add_c(a[4],b[4],c3,c1,c2);
1114	addu	$25,$1
1115	addu	$2,$25
1116	sltu	$1,$2,$25
1117	addu	$3,$1
1118	mflo	$24
1119	mfhi	$25
1120	addu	$7,$24
1121	sltu	$1,$7,$24
1122	multu	$15,$19		# mul_add_c(a[3],b[5],c3,c1,c2);
1123	addu	$25,$1
1124	addu	$2,$25
1125	sltu	$1,$2,$25
1126	addu	$3,$1
1127	mflo	$24
1128	mfhi	$25
1129	addu	$7,$24
1130	sltu	$1,$7,$24
1131	multu	$14,$21		# mul_add_c(a[2],b[6],c3,c1,c2);
1132	addu	$25,$1
1133	addu	$2,$25
1134	sltu	$1,$2,$25
1135	addu	$3,$1
1136	mflo	$24
1137	mfhi	$25
1138	addu	$7,$24
1139	sltu	$1,$7,$24
1140	multu	$13,$6		# mul_add_c(a[1],b[7],c3,c1,c2);
1141	addu	$25,$1
1142	addu	$2,$25
1143	sltu	$1,$2,$25
1144	addu	$3,$1
1145	mflo	$24
1146	mfhi	$25
1147	addu	$7,$24
1148	sltu	$1,$7,$24
1149	 multu	$14,$6		# mul_add_c(a[2],b[7],c1,c2,c3);
1150	addu	$25,$1
1151	addu	$2,$25
1152	sltu	$1,$2,$25
1153	addu	$3,$1
1154	sw	$7,8*4($4)	# r[8]=c3;
1155
	# column 9: terms with i+j = 9
1156	mflo	$24
1157	mfhi	$25
1158	addu	$2,$24
1159	sltu	$1,$2,$24
1160	multu	$15,$21		# mul_add_c(a[3],b[6],c1,c2,c3);
1161	addu	$25,$1
1162	addu	$3,$25
1163	sltu	$7,$3,$25
1164	mflo	$24
1165	mfhi	$25
1166	addu	$2,$24
1167	sltu	$1,$2,$24
1168	multu	$16,$19		# mul_add_c(a[4],b[5],c1,c2,c3);
1169	addu	$25,$1
1170	addu	$3,$25
1171	sltu	$1,$3,$25
1172	addu	$7,$1
1173	mflo	$24
1174	mfhi	$25
1175	addu	$2,$24
1176	sltu	$1,$2,$24
1177	multu	$18,$17		# mul_add_c(a[5],b[4],c1,c2,c3);
1178	addu	$25,$1
1179	addu	$3,$25
1180	sltu	$1,$3,$25
1181	addu	$7,$1
1182	mflo	$24
1183	mfhi	$25
1184	addu	$2,$24
1185	sltu	$1,$2,$24
1186	multu	$20,$11		# mul_add_c(a[6],b[3],c1,c2,c3);
1187	addu	$25,$1
1188	addu	$3,$25
1189	sltu	$1,$3,$25
1190	addu	$7,$1
1191	mflo	$24
1192	mfhi	$25
1193	addu	$2,$24
1194	sltu	$1,$2,$24
1195	multu	$5,$10		# mul_add_c(a[7],b[2],c1,c2,c3);
1196	addu	$25,$1
1197	addu	$3,$25
1198	sltu	$1,$3,$25
1199	addu	$7,$1
1200	mflo	$24
1201	mfhi	$25
1202	addu	$2,$24
1203	sltu	$1,$2,$24
1204	 multu	$5,$11		# mul_add_c(a[7],b[3],c2,c3,c1);
1205	addu	$25,$1
1206	addu	$3,$25
1207	sltu	$1,$3,$25
1208	addu	$7,$1
1209	sw	$2,9*4($4)	# r[9]=c1;
1210
	# column 10: terms with i+j = 10
1211	mflo	$24
1212	mfhi	$25
1213	addu	$3,$24
1214	sltu	$1,$3,$24
1215	multu	$20,$17		# mul_add_c(a[6],b[4],c2,c3,c1);
1216	addu	$25,$1
1217	addu	$7,$25
1218	sltu	$2,$7,$25
1219	mflo	$24
1220	mfhi	$25
1221	addu	$3,$24
1222	sltu	$1,$3,$24
1223	multu	$18,$19		# mul_add_c(a[5],b[5],c2,c3,c1);
1224	addu	$25,$1
1225	addu	$7,$25
1226	sltu	$1,$7,$25
1227	addu	$2,$1
1228	mflo	$24
1229	mfhi	$25
1230	addu	$3,$24
1231	sltu	$1,$3,$24
1232	multu	$16,$21		# mul_add_c(a[4],b[6],c2,c3,c1);
1233	addu	$25,$1
1234	addu	$7,$25
1235	sltu	$1,$7,$25
1236	addu	$2,$1
1237	mflo	$24
1238	mfhi	$25
1239	addu	$3,$24
1240	sltu	$1,$3,$24
1241	multu	$15,$6		# mul_add_c(a[3],b[7],c2,c3,c1);
1242	addu	$25,$1
1243	addu	$7,$25
1244	sltu	$1,$7,$25
1245	addu	$2,$1
1246	mflo	$24
1247	mfhi	$25
1248	addu	$3,$24
1249	sltu	$1,$3,$24
1250	multu	$16,$6		# mul_add_c(a[4],b[7],c3,c1,c2);
1251	addu	$25,$1
1252	addu	$7,$25
1253	sltu	$1,$7,$25
1254	addu	$2,$1
1255	sw	$3,10*4($4)	# r[10]=c2;
1256
	# column 11: terms with i+j = 11
1257	mflo	$24
1258	mfhi	$25
1259	addu	$7,$24
1260	sltu	$1,$7,$24
1261	multu	$18,$21		# mul_add_c(a[5],b[6],c3,c1,c2);
1262	addu	$25,$1
1263	addu	$2,$25
1264	sltu	$3,$2,$25
1265	mflo	$24
1266	mfhi	$25
1267	addu	$7,$24
1268	sltu	$1,$7,$24
1269	multu	$20,$19		# mul_add_c(a[6],b[5],c3,c1,c2);
1270	addu	$25,$1
1271	addu	$2,$25
1272	sltu	$1,$2,$25
1273	addu	$3,$1
1274	mflo	$24
1275	mfhi	$25
1276	addu	$7,$24
1277	sltu	$1,$7,$24
1278	multu	$5,$17		# mul_add_c(a[7],b[4],c3,c1,c2);
1279	addu	$25,$1
1280	addu	$2,$25
1281	sltu	$1,$2,$25
1282	addu	$3,$1
1283	mflo	$24
1284	mfhi	$25
1285	addu	$7,$24
1286	sltu	$1,$7,$24
1287	 multu	$5,$19		# mul_add_c(a[7],b[5],c1,c2,c3);
1288	addu	$25,$1
1289	addu	$2,$25
1290	sltu	$1,$2,$25
1291	addu	$3,$1
1292	sw	$7,11*4($4)	# r[11]=c3;
1293
	# column 12: terms with i+j = 12
1294	mflo	$24
1295	mfhi	$25
1296	addu	$2,$24
1297	sltu	$1,$2,$24
1298	multu	$20,$21		# mul_add_c(a[6],b[6],c1,c2,c3);
1299	addu	$25,$1
1300	addu	$3,$25
1301	sltu	$7,$3,$25
1302	mflo	$24
1303	mfhi	$25
1304	addu	$2,$24
1305	sltu	$1,$2,$24
1306	multu	$18,$6		# mul_add_c(a[5],b[7],c1,c2,c3);
1307	addu	$25,$1
1308	addu	$3,$25
1309	sltu	$1,$3,$25
1310	addu	$7,$1
1311	mflo	$24
1312	mfhi	$25
1313	addu	$2,$24
1314	sltu	$1,$2,$24
1315	 multu	$20,$6		# mul_add_c(a[6],b[7],c2,c3,c1);
1316	addu	$25,$1
1317	addu	$3,$25
1318	sltu	$1,$3,$25
1319	addu	$7,$1
1320	sw	$2,12*4($4)	# r[12]=c1;
1321
	# column 13: terms with i+j = 13
1322	mflo	$24
1323	mfhi	$25
1324	addu	$3,$24
1325	sltu	$1,$3,$24
1326	multu	$5,$21		# mul_add_c(a[7],b[6],c2,c3,c1);
1327	addu	$25,$1
1328	addu	$7,$25
1329	sltu	$2,$7,$25
1330	mflo	$24
1331	mfhi	$25
1332	addu	$3,$24
1333	sltu	$1,$3,$24
1334	multu	$5,$6		# mul_add_c(a[7],b[7],c3,c1,c2);
1335	addu	$25,$1
1336	addu	$7,$25
1337	sltu	$1,$7,$25
1338	addu	$2,$1
1339	sw	$3,13*4($4)	# r[13]=c2;
1340
	# column 14 (a[7]*b[7]); what is left in $2 becomes the top word r[15]
1341	mflo	$24
1342	mfhi	$25
1343	addu	$7,$24
1344	sltu	$1,$7,$24
1345	addu	$25,$1
1346	addu	$2,$25
1347	sw	$7,14*4($4)	# r[14]=c3;
1348	sw	$2,15*4($4)	# r[15]=c1;
1349
1350	.set	noreorder
1351	lw	$21,5*4($29)
1352	lw	$20,4*4($29)
1353	lw	$19,3*4($29)
1354	lw	$18,2*4($29)
1355	lw	$17,1*4($29)
1356	lw	$16,0*4($29)
1357	jr	$31
1358	addu $29,6*4
1359.end	bn_mul_comba8
1360
# bn_mul_comba4 -- full product of two 4-word operands.
# In:  $4 = r (8 result words are stored), $5 = a (4 words), $6 = b (4 words).
# The product is built column by column: column k is the sum of all
# a[i]*b[j] with i+j = k. A three-word accumulator rotates through $2, $3
# and $7 (c1, c2, c3 in the mul_add_c comments); when a column is complete
# its lowest accumulator word is stored as r[k].
# Register use: a[0..3] = $12-$15, b[0..3] = $8-$11,
#   $24/$25 = low/high word of the current product, $1 = carry scratch.
# No stack frame is used and no registers are saved.
# Each mul_add_c comment sits on the multu that STARTS that product; the
# additions for it follow the next mflo/mfhi pair.
1361.align	5
1362.globl	bn_mul_comba4
1363.ent	bn_mul_comba4
1364bn_mul_comba4:
1365	.set	reorder
1366	lw	$12,0($5)
1367	lw	$8,0($6)
1368	lw	$13,4($5)
1369	lw	$14,2*4($5)
1370	multu	$12,$8		# mul_add_c(a[0],b[0],c1,c2,c3);
1371	lw	$15,3*4($5)
1372	lw	$9,4($6)
1373	lw	$10,2*4($6)
1374	lw	$11,3*4($6)
1375	mflo	$2
1376	mfhi	$3
1377	sw	$2,0($4)	# r[0]=c1;
1378
	# column 1: terms with i+j = 1
1379	multu	$12,$9		# mul_add_c(a[0],b[1],c2,c3,c1);
1380	mflo	$24
1381	mfhi	$25
1382	addu	$3,$24
1383	sltu	$1,$3,$24
1384	multu	$13,$8		# mul_add_c(a[1],b[0],c2,c3,c1);
1385	addu	$7,$25,$1
1386	mflo	$24
1387	mfhi	$25
1388	addu	$3,$24
1389	sltu	$1,$3,$24
1390	 multu	$14,$8		# mul_add_c(a[2],b[0],c3,c1,c2);
1391	addu	$25,$1
1392	addu	$7,$25
1393	sltu	$2,$7,$25
1394	sw	$3,4($4)	# r[1]=c2;
1395
	# column 2: terms with i+j = 2
1396	mflo	$24
1397	mfhi	$25
1398	addu	$7,$24
1399	sltu	$1,$7,$24
1400	multu	$13,$9		# mul_add_c(a[1],b[1],c3,c1,c2);
1401	addu	$25,$1
1402	addu	$2,$25
1403	mflo	$24
1404	mfhi	$25
1405	addu	$7,$24
1406	sltu	$1,$7,$24
1407	multu	$12,$10		# mul_add_c(a[0],b[2],c3,c1,c2);
1408	addu	$25,$1
1409	addu	$2,$25
1410	sltu	$3,$2,$25
1411	mflo	$24
1412	mfhi	$25
1413	addu	$7,$24
1414	sltu	$1,$7,$24
1415	 multu	$12,$11		# mul_add_c(a[0],b[3],c1,c2,c3);
1416	addu	$25,$1
1417	addu	$2,$25
1418	sltu	$1,$2,$25
1419	addu	$3,$1
1420	sw	$7,2*4($4)	# r[2]=c3;
1421
	# column 3: terms with i+j = 3
1422	mflo	$24
1423	mfhi	$25
1424	addu	$2,$24
1425	sltu	$1,$2,$24
1426	multu	$13,$10		# mul_add_c(a[1],b[2],c1,c2,c3);
1427	addu	$25,$1
1428	addu	$3,$25
1429	sltu	$7,$3,$25
1430	mflo	$24
1431	mfhi	$25
1432	addu	$2,$24
1433	sltu	$1,$2,$24
1434	multu	$14,$9		# mul_add_c(a[2],b[1],c1,c2,c3);
1435	addu	$25,$1
1436	addu	$3,$25
1437	sltu	$1,$3,$25
1438	addu	$7,$1
1439	mflo	$24
1440	mfhi	$25
1441	addu	$2,$24
1442	sltu	$1,$2,$24
1443	multu	$15,$8		# mul_add_c(a[3],b[0],c1,c2,c3);
1444	addu	$25,$1
1445	addu	$3,$25
1446	sltu	$1,$3,$25
1447	addu	$7,$1
1448	mflo	$24
1449	mfhi	$25
1450	addu	$2,$24
1451	sltu	$1,$2,$24
1452	 multu	$15,$9		# mul_add_c(a[3],b[1],c2,c3,c1);
1453	addu	$25,$1
1454	addu	$3,$25
1455	sltu	$1,$3,$25
1456	addu	$7,$1
1457	sw	$2,3*4($4)	# r[3]=c1;
1458
	# column 4: terms with i+j = 4
1459	mflo	$24
1460	mfhi	$25
1461	addu	$3,$24
1462	sltu	$1,$3,$24
1463	multu	$14,$10		# mul_add_c(a[2],b[2],c2,c3,c1);
1464	addu	$25,$1
1465	addu	$7,$25
1466	sltu	$2,$7,$25
1467	mflo	$24
1468	mfhi	$25
1469	addu	$3,$24
1470	sltu	$1,$3,$24
1471	multu	$13,$11		# mul_add_c(a[1],b[3],c2,c3,c1);
1472	addu	$25,$1
1473	addu	$7,$25
1474	sltu	$1,$7,$25
1475	addu	$2,$1
1476	mflo	$24
1477	mfhi	$25
1478	addu	$3,$24
1479	sltu	$1,$3,$24
1480	 multu	$14,$11		# mul_add_c(a[2],b[3],c3,c1,c2);
1481	addu	$25,$1
1482	addu	$7,$25
1483	sltu	$1,$7,$25
1484	addu	$2,$1
1485	sw	$3,4*4($4)	# r[4]=c2;
1486
	# column 5: terms with i+j = 5
1487	mflo	$24
1488	mfhi	$25
1489	addu	$7,$24
1490	sltu	$1,$7,$24
1491	multu	$15,$10		# mul_add_c(a[3],b[2],c3,c1,c2);
1492	addu	$25,$1
1493	addu	$2,$25
1494	sltu	$3,$2,$25
1495	mflo	$24
1496	mfhi	$25
1497	addu	$7,$24
1498	sltu	$1,$7,$24
1499	 multu	$15,$11		# mul_add_c(a[3],b[3],c1,c2,c3);
1500	addu	$25,$1
1501	addu	$2,$25
1502	sltu	$1,$2,$25
1503	addu	$3,$1
1504	sw	$7,5*4($4)	# r[5]=c3;
1505
	# column 6 (a[3]*b[3]); what is left in $3 becomes the top word r[7]
1506	mflo	$24
1507	mfhi	$25
1508	addu	$2,$24
1509	sltu	$1,$2,$24
1510	addu	$25,$1
1511	addu	$3,$25
1512	sw	$2,6*4($4)	# r[6]=c1;
1513	sw	$3,7*4($4)	# r[7]=c2;
1514
1515	.set	noreorder
1516	jr	$31
1517	nop
1518.end	bn_mul_comba4
1519
1520.align	5
1521.globl	bn_sqr_comba8
1522.ent	bn_sqr_comba8
1523bn_sqr_comba8:
1524	.set	reorder
1525	lw	$12,0($5)
1526	lw	$13,4($5)
1527	lw	$14,2*4($5)
1528	lw	$15,3*4($5)
1529
1530	multu	$12,$12		# mul_add_c(a[0],b[0],c1,c2,c3);
1531	lw	$8,4*4($5)
1532	lw	$9,5*4($5)
1533	lw	$10,6*4($5)
1534	lw	$11,7*4($5)
1535	mflo	$2
1536	mfhi	$3
1537	sw	$2,0($4)
1538
1539	multu	$12,$13		# mul_add_c2(a[0],b[1],c2,c3,c1);
1540	mflo	$24
1541	mfhi	$25
1542	slt	$2,$25,$0
1543	sll	$25,1
1544	 multu	$14,$12		# mul_add_c2(a[2],b[0],c3,c1,c2);
1545	slt	$6,$24,$0
1546	addu	$25,$6
1547	sll	$24,1
1548	addu	$3,$24
1549	sltu	$1,$3,$24
1550	addu	$7,$25,$1
1551	sw	$3,4($4)
1552
1553	mflo	$24
1554	mfhi	$25
1555	slt	$3,$25,$0
1556	sll	$25,1
1557	multu	$13,$13		# mul_add_c(a[1],b[1],c3,c1,c2);
1558	slt	$6,$24,$0
1559	addu	$25,$6
1560	sll	$24,1
1561	addu	$7,$24
1562	sltu	$1,$7,$24
1563	addu	$25,$1
1564	addu	$2,$25
1565	sltu	$1,$2,$25
1566	addu	$3,$1
1567	mflo	$24
1568	mfhi	$25
1569	addu	$7,$24
1570	sltu	$1,$7,$24
1571	 multu	$12,$15		# mul_add_c2(a[0],b[3],c1,c2,c3);
1572	addu	$25,$1
1573	addu	$2,$25
1574	sltu	$1,$2,$25
1575	addu	$3,$1
1576	sw	$7,2*4($4)
1577
1578	mflo	$24
1579	mfhi	$25
1580	slt	$7,$25,$0
1581	sll	$25,1
1582	multu	$13,$14		# mul_add_c2(a[1],b[2],c1,c2,c3);
1583	slt	$6,$24,$0
1584	addu	$25,$6
1585	sll	$24,1
1586	addu	$2,$24
1587	sltu	$1,$2,$24
1588	addu	$25,$1
1589	addu	$3,$25
1590	sltu	$1,$3,$25
1591	addu	$7,$1
1592	mflo	$24
1593	mfhi	$25
1594	slt	$1,$25,$0
1595	addu	$7,$1
1596	 multu	$8,$12		# mul_add_c2(a[4],b[0],c2,c3,c1);
1597	sll	$25,1
1598	slt	$6,$24,$0
1599	addu	$25,$6
1600	sll	$24,1
1601	addu	$2,$24
1602	sltu	$1,$2,$24
1603	addu	$25,$1
1604	addu	$3,$25
1605	sltu	$1,$3,$25
1606	addu	$7,$1
1607	sw	$2,3*4($4)
1608
1609	mflo	$24
1610	mfhi	$25
1611	slt	$2,$25,$0
1612	sll	$25,1
1613	multu	$15,$13		# mul_add_c2(a[3],b[1],c2,c3,c1);
1614	slt	$6,$24,$0
1615	addu	$25,$6
1616	sll	$24,1
1617	addu	$3,$24
1618	sltu	$1,$3,$24
1619	addu	$25,$1
1620	addu	$7,$25
1621	sltu	$1,$7,$25
1622	addu	$2,$1
1623	mflo	$24
1624	mfhi	$25
1625	slt	$1,$25,$0
1626	addu	$2,$1
1627	multu	$14,$14		# mul_add_c(a[2],b[2],c2,c3,c1);
1628	sll	$25,1
1629	slt	$6,$24,$0
1630	addu	$25,$6
1631	sll	$24,1
1632	addu	$3,$24
1633	sltu	$1,$3,$24
1634	addu	$25,$1
1635	addu	$7,$25
1636	sltu	$1,$7,$25
1637	addu	$2,$1
1638	mflo	$24
1639	mfhi	$25
1640	addu	$3,$24
1641	sltu	$1,$3,$24
1642	 multu	$12,$9		# mul_add_c2(a[0],b[5],c3,c1,c2);
1643	addu	$25,$1
1644	addu	$7,$25
1645	sltu	$1,$7,$25
1646	addu	$2,$1
1647	sw	$3,4*4($4)
1648
1649	mflo	$24
1650	mfhi	$25
1651	slt	$3,$25,$0
1652	sll	$25,1
1653	multu	$13,$8		# mul_add_c2(a[1],b[4],c3,c1,c2);
1654	slt	$6,$24,$0
1655	addu	$25,$6
1656	sll	$24,1
1657	addu	$7,$24
1658	sltu	$1,$7,$24
1659	addu	$25,$1
1660	addu	$2,$25
1661	sltu	$1,$2,$25
1662	addu	$3,$1
1663	mflo	$24
1664	mfhi	$25
1665	slt	$1,$25,$0
1666	addu	$3,$1
1667	multu	$14,$15		# mul_add_c2(a[2],b[3],c3,c1,c2);
1668	sll	$25,1
1669	slt	$6,$24,$0
1670	addu	$25,$6
1671	sll	$24,1
1672	addu	$7,$24
1673	sltu	$1,$7,$24
1674	addu	$25,$1
1675	addu	$2,$25
1676	sltu	$1,$2,$25
1677	addu	$3,$1
1678	mflo	$24
1679	mfhi	$25
1680	slt	$1,$25,$0
1681	 multu	$10,$12		# mul_add_c2(a[6],b[0],c1,c2,c3);
1682	addu	$3,$1
1683	sll	$25,1
1684	slt	$6,$24,$0
1685	addu	$25,$6
1686	sll	$24,1
1687	addu	$7,$24
1688	sltu	$1,$7,$24
1689	addu	$25,$1
1690	addu	$2,$25
1691	sltu	$1,$2,$25
1692	addu	$3,$1
1693	sw	$7,5*4($4)
1694
1695	mflo	$24
1696	mfhi	$25
1697	slt	$7,$25,$0
1698	sll	$25,1
1699	multu	$9,$13		# mul_add_c2(a[5],b[1],c1,c2,c3);
1700	slt	$6,$24,$0
1701	addu	$25,$6
1702	sll	$24,1
1703	addu	$2,$24
1704	sltu	$1,$2,$24
1705	addu	$25,$1
1706	addu	$3,$25
1707	sltu	$1,$3,$25
1708	addu	$7,$1
1709	mflo	$24
1710	mfhi	$25
1711	slt	$1,$25,$0
1712	addu	$7,$1
1713	multu	$8,$14		# mul_add_c2(a[4],b[2],c1,c2,c3);
1714	sll	$25,1
1715	slt	$6,$24,$0
1716	addu	$25,$6
1717	sll	$24,1
1718	addu	$2,$24
1719	sltu	$1,$2,$24
1720	addu	$25,$1
1721	addu	$3,$25
1722	sltu	$1,$3,$25
1723	addu	$7,$1
1724	mflo	$24
1725	mfhi	$25
1726	slt	$1,$25,$0
1727	addu	$7,$1
1728	multu	$15,$15		# mul_add_c(a[3],b[3],c1,c2,c3);
1729	sll	$25,1
1730	slt	$6,$24,$0
1731	addu	$25,$6
1732	sll	$24,1
1733	addu	$2,$24
1734	sltu	$1,$2,$24
1735	addu	$25,$1
1736	addu	$3,$25
1737	sltu	$1,$3,$25
1738	addu	$7,$1
1739	mflo	$24
1740	mfhi	$25
1741	addu	$2,$24
1742	sltu	$1,$2,$24
1743	 multu	$12,$11		# mul_add_c2(a[0],b[7],c2,c3,c1);
1744	addu	$25,$1
1745	addu	$3,$25
1746	sltu	$1,$3,$25
1747	addu	$7,$1
1748	sw	$2,6*4($4)
1749
1750	mflo	$24
1751	mfhi	$25
1752	slt	$2,$25,$0
1753	sll	$25,1
1754	multu	$13,$10		# mul_add_c2(a[1],b[6],c2,c3,c1);
1755	slt	$6,$24,$0
1756	addu	$25,$6
1757	sll	$24,1
1758	addu	$3,$24
1759	sltu	$1,$3,$24
1760	addu	$25,$1
1761	addu	$7,$25
1762	sltu	$1,$7,$25
1763	addu	$2,$1
1764	mflo	$24
1765	mfhi	$25
1766	slt	$1,$25,$0
1767	addu	$2,$1
1768	multu	$14,$9		# mul_add_c2(a[2],b[5],c2,c3,c1);
1769	sll	$25,1
1770	slt	$6,$24,$0
1771	addu	$25,$6
1772	sll	$24,1
1773	addu	$3,$24
1774	sltu	$1,$3,$24
1775	addu	$25,$1
1776	addu	$7,$25
1777	sltu	$1,$7,$25
1778	addu	$2,$1
1779	mflo	$24
1780	mfhi	$25
1781	slt	$1,$25,$0
1782	addu	$2,$1
1783	multu	$15,$8		# mul_add_c2(a[3],b[4],c2,c3,c1);
1784	sll	$25,1
1785	slt	$6,$24,$0
1786	addu	$25,$6
1787	sll	$24,1
1788	addu	$3,$24
1789	sltu	$1,$3,$24
1790	addu	$25,$1
1791	addu	$7,$25
1792	sltu	$1,$7,$25
1793	addu	$2,$1
1794	mflo	$24
1795	mfhi	$25
1796	slt	$1,$25,$0
1797	addu	$2,$1
1798	 multu	$11,$13		# mul_add_c2(a[7],b[1],c3,c1,c2);
1799	sll	$25,1
1800	slt	$6,$24,$0
1801	addu	$25,$6
1802	sll	$24,1
1803	addu	$3,$24
1804	sltu	$1,$3,$24
1805	addu	$25,$1
1806	addu	$7,$25
1807	sltu	$1,$7,$25
1808	addu	$2,$1
1809	sw	$3,7*4($4)
1810
1811	mflo	$24
1812	mfhi	$25
1813	slt	$3,$25,$0
1814	sll	$25,1
1815	multu	$10,$14		# mul_add_c2(a[6],b[2],c3,c1,c2);
1816	slt	$6,$24,$0
1817	addu	$25,$6
1818	sll	$24,1
1819	addu	$7,$24
1820	sltu	$1,$7,$24
1821	addu	$25,$1
1822	addu	$2,$25
1823	sltu	$1,$2,$25
1824	addu	$3,$1
1825	mflo	$24
1826	mfhi	$25
1827	slt	$1,$25,$0
1828	addu	$3,$1
1829	multu	$9,$15		# mul_add_c2(a[5],b[3],c3,c1,c2);
1830	sll	$25,1
1831	slt	$6,$24,$0
1832	addu	$25,$6
1833	sll	$24,1
1834	addu	$7,$24
1835	sltu	$1,$7,$24
1836	addu	$25,$1
1837	addu	$2,$25
1838	sltu	$1,$2,$25
1839	addu	$3,$1
1840	mflo	$24
1841	mfhi	$25
1842	slt	$1,$25,$0
1843	addu	$3,$1
1844	multu	$8,$8		# mul_add_c(a[4],b[4],c3,c1,c2);
1845	sll	$25,1
1846	slt	$6,$24,$0
1847	addu	$25,$6
1848	sll	$24,1
1849	addu	$7,$24
1850	sltu	$1,$7,$24
1851	addu	$25,$1
1852	addu	$2,$25
1853	sltu	$1,$2,$25
1854	addu	$3,$1
1855	mflo	$24
1856	mfhi	$25
1857	addu	$7,$24
1858	sltu	$1,$7,$24
1859	 multu	$14,$11		# mul_add_c2(a[2],b[7],c1,c2,c3);
1860	addu	$25,$1
1861	addu	$2,$25
1862	sltu	$1,$2,$25
1863	addu	$3,$1
1864	sw	$7,8*4($4)
1865
1866	mflo	$24
1867	mfhi	$25
1868	slt	$7,$25,$0
1869	sll	$25,1
1870	multu	$15,$10		# mul_add_c2(a[3],b[6],c1,c2,c3);
1871	slt	$6,$24,$0
1872	addu	$25,$6
1873	sll	$24,1
1874	addu	$2,$24
1875	sltu	$1,$2,$24
1876	addu	$25,$1
1877	addu	$3,$25
1878	sltu	$1,$3,$25
1879	addu	$7,$1
1880	mflo	$24
1881	mfhi	$25
1882	slt	$1,$25,$0
1883	addu	$7,$1
1884	multu	$8,$9		# mul_add_c2(a[4],b[5],c1,c2,c3);
1885	sll	$25,1
1886	slt	$6,$24,$0
1887	addu	$25,$6
1888	sll	$24,1
1889	addu	$2,$24
1890	sltu	$1,$2,$24
1891	addu	$25,$1
1892	addu	$3,$25
1893	sltu	$1,$3,$25
1894	addu	$7,$1
1895	mflo	$24
1896	mfhi	$25
1897	slt	$1,$25,$0
1898	addu	$7,$1
1899	 multu	$11,$15		# mul_add_c2(a[7],b[3],c2,c3,c1);
1900	sll	$25,1
1901	slt	$6,$24,$0
1902	addu	$25,$6
1903	sll	$24,1
1904	addu	$2,$24
1905	sltu	$1,$2,$24
1906	addu	$25,$1
1907	addu	$3,$25
1908	sltu	$1,$3,$25
1909	addu	$7,$1
1910	sw	$2,9*4($4)
1911
1912	mflo	$24
1913	mfhi	$25
1914	slt	$2,$25,$0
1915	sll	$25,1
1916	multu	$10,$8		# mul_add_c2(a[6],b[4],c2,c3,c1);
1917	slt	$6,$24,$0
1918	addu	$25,$6
1919	sll	$24,1
1920	addu	$3,$24
1921	sltu	$1,$3,$24
1922	addu	$25,$1
1923	addu	$7,$25
1924	sltu	$1,$7,$25
1925	addu	$2,$1
1926	mflo	$24
1927	mfhi	$25
1928	slt	$1,$25,$0
1929	addu	$2,$1
1930	multu	$9,$9		# mul_add_c(a[5],b[5],c2,c3,c1);
1931	sll	$25,1
1932	slt	$6,$24,$0
1933	addu	$25,$6
1934	sll	$24,1
1935	addu	$3,$24
1936	sltu	$1,$3,$24
1937	addu	$25,$1
1938	addu	$7,$25
1939	sltu	$1,$7,$25
1940	addu	$2,$1
1941	mflo	$24
1942	mfhi	$25
1943	addu	$3,$24
1944	sltu	$1,$3,$24
1945	 multu	$8,$11		# mul_add_c2(a[4],b[7],c3,c1,c2);
1946	addu	$25,$1
1947	addu	$7,$25
1948	sltu	$1,$7,$25
1949	addu	$2,$1
1950	sw	$3,10*4($4)
1951
1952	mflo	$24
1953	mfhi	$25
1954	slt	$3,$25,$0
1955	sll	$25,1
1956	multu	$9,$10		# mul_add_c2(a[5],b[6],c3,c1,c2);
1957	slt	$6,$24,$0
1958	addu	$25,$6
1959	sll	$24,1
1960	addu	$7,$24
1961	sltu	$1,$7,$24
1962	addu	$25,$1
1963	addu	$2,$25
1964	sltu	$1,$2,$25
1965	addu	$3,$1
1966	mflo	$24
1967	mfhi	$25
1968	slt	$1,$25,$0
1969	addu	$3,$1
1970	 multu	$11,$9		# mul_add_c2(a[7],b[5],c1,c2,c3);
1971	sll	$25,1
1972	slt	$6,$24,$0
1973	addu	$25,$6
1974	sll	$24,1
1975	addu	$7,$24
1976	sltu	$1,$7,$24
1977	addu	$25,$1
1978	addu	$2,$25
1979	sltu	$1,$2,$25
1980	addu	$3,$1
1981	sw	$7,11*4($4)
1982
1983	mflo	$24
1984	mfhi	$25
1985	slt	$7,$25,$0
1986	sll	$25,1
1987	multu	$10,$10		# mul_add_c(a[6],b[6],c1,c2,c3);
1988	slt	$6,$24,$0
1989	addu	$25,$6
1990	sll	$24,1
1991	addu	$2,$24
1992	sltu	$1,$2,$24
1993	addu	$25,$1
1994	addu	$3,$25
1995	sltu	$1,$3,$25
1996	addu	$7,$1
1997	mflo	$24
1998	mfhi	$25
1999	addu	$2,$24
2000	sltu	$1,$2,$24
2001	 multu	$10,$11		# mul_add_c2(a[6],b[7],c2,c3,c1);
2002	addu	$25,$1
2003	addu	$3,$25
2004	sltu	$1,$3,$25
2005	addu	$7,$1
2006	sw	$2,12*4($4)
2007
2008	mflo	$24
2009	mfhi	$25
2010	slt	$2,$25,$0
2011	sll	$25,1
2012	 multu	$11,$11		# mul_add_c(a[7],b[7],c3,c1,c2);
2013	slt	$6,$24,$0
2014	addu	$25,$6
2015	sll	$24,1
2016	addu	$3,$24
2017	sltu	$1,$3,$24
2018	addu	$25,$1
2019	addu	$7,$25
2020	sltu	$1,$7,$25
2021	addu	$2,$1
2022	sw	$3,13*4($4)
2023
2024	mflo	$24
2025	mfhi	$25
2026	addu	$7,$24
2027	sltu	$1,$7,$24
2028	addu	$25,$1
2029	addu	$2,$25
2030	sw	$7,14*4($4)
2031	sw	$2,15*4($4)
2032
2033	.set	noreorder
2034	jr	$31
2035	nop
2036.end	bn_sqr_comba8
2037
#
# bn_sqr_comba4 -- 4-word by 4-word squaring, column by column.
#
# Presumably called from C as bn_sqr_comba4(r, a) (TODO confirm against
# the C prototype).  What the code itself does:
#
# In:	$4 = r, base of an 8-word (32-bit words) output area
#	$5 = a, base of a 4-word input
# Out:	r[0..7] = a[0..3] squared, least significant word first
# Uses:	$12-$15	a[0]..a[3]
#	$2,$3,$7	three-word column accumulator c1,c2,c3; the roles
#			rotate by one register after every stored word
#	$24,$25	lo/hi halves of the current product
#	$1,$6		carry/scratch ($1 is usable because of .set noat)
#	HI/LO
#
# mul_add_c (x,y,lo,mid,hi) adds x*y   to the three-word accumulator.
# mul_add_c2(x,y,lo,mid,hi) adds 2*x*y to the three-word accumulator.
# The comment on each multu names the step whose product that multu
# starts; the product is consumed by the following mflo/mfhi.
#
# The product is doubled by shifting hi:lo left by one bit.  After the
# shift the high half can be 0xffffffff, so adding the carry from the
# low word may wrap it to zero.  That wrap is a carry into the third
# accumulator word and is collected explicitly by the
# "sltu $1,<sum>,$1 / addu <third>,$1" pair in every mul_add_c2 step
# (without it e.g. a = {0xfffffffe,0x80000001,0,0} gives a wrong r[3]).
#
# NOTE(review): bn_sqr_comba8 uses the same doubling sequence and needs
# the same wrap check.
#
.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
	.set	reorder
	lw	$12,0($5)		# a[0]
	lw	$13,4($5)		# a[1]
	multu	$12,$12		# mul_add_c(a[0],b[0],c1,c2,c3);
	lw	$14,2*4($5)		# a[2]
	lw	$15,3*4($5)		# a[3]
	mflo	$2
	mfhi	$3
	sw	$2,0($4)		# r[0]

	# column 1: accumulator is $3 (low), $7 (mid), $2 (high)
	multu	$12,$13		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$24
	mfhi	$25
	slt	$2,$25,$0		# bit shifted out of hi starts the high word
	sll	$25,1
	 multu	$14,$12		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$6,$24,$0		# top bit of lo moves into hi
	addu	$25,$6
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$7,$25,$1
	sltu	$1,$7,$1		# did hi+carry wrap?
	addu	$2,$1			# ...then it belongs to the high word
	sw	$3,4($4)		# r[1]

	# column 2: accumulator is $7 (low), $2 (mid), $3 (high)
	mflo	$24
	mfhi	$25
	slt	$3,$25,$0
	sll	$25,1
	multu	$13,$13		# mul_add_c(a[1],b[1],c3,c1,c2);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$25,$1
	sltu	$1,$25,$1		# did hi+carry wrap?
	addu	$3,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	mflo	$24
	mfhi	$25
	addu	$7,$24
	sltu	$1,$7,$24
	 multu	$12,$15		# mul_add_c2(a[0],b[3],c1,c2,c3);
	addu	$25,$1			# undoubled hi <= 0xfffffffe, cannot wrap
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,2*4($4)		# r[2]

	# column 3: accumulator is $2 (low), $3 (mid), $7 (high)
	mflo	$24
	mfhi	$25
	slt	$7,$25,$0
	sll	$25,1
	multu	$13,$14		# mul_add_c2(a[1],b[2],c1,c2,c3);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$25,$1
	sltu	$1,$25,$1		# did hi+carry wrap?
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	mflo	$24
	mfhi	$25
	slt	$1,$25,$0
	addu	$7,$1
	 multu	$15,$13		# mul_add_c2(a[3],b[1],c2,c3,c1);
	sll	$25,1
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$25,$1
	sltu	$1,$25,$1		# did hi+carry wrap?
	addu	$7,$1
	addu	$3,$25
	sltu	$1,$3,$25
	addu	$7,$1
	sw	$2,3*4($4)		# r[3]

	# column 4: accumulator is $3 (low), $7 (mid), $2 (high)
	mflo	$24
	mfhi	$25
	slt	$2,$25,$0
	sll	$25,1
	multu	$14,$14		# mul_add_c(a[2],b[2],c2,c3,c1);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$3,$24
	sltu	$1,$3,$24
	addu	$25,$1
	sltu	$1,$25,$1		# did hi+carry wrap?
	addu	$2,$1
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	mflo	$24
	mfhi	$25
	addu	$3,$24
	sltu	$1,$3,$24
	 multu	$14,$15		# mul_add_c2(a[2],b[3],c3,c1,c2);
	addu	$25,$1			# undoubled hi <= 0xfffffffe, cannot wrap
	addu	$7,$25
	sltu	$1,$7,$25
	addu	$2,$1
	sw	$3,4*4($4)		# r[4]

	# column 5: accumulator is $7 (low), $2 (mid), $3 (high)
	mflo	$24
	mfhi	$25
	slt	$3,$25,$0
	sll	$25,1
	 multu	$15,$15		# mul_add_c(a[3],b[3],c1,c2,c3);
	slt	$6,$24,$0
	addu	$25,$6
	sll	$24,1
	addu	$7,$24
	sltu	$1,$7,$24
	addu	$25,$1
	sltu	$1,$25,$1		# did hi+carry wrap?
	addu	$3,$1
	addu	$2,$25
	sltu	$1,$2,$25
	addu	$3,$1
	sw	$7,5*4($4)		# r[5]

	# column 6: only two words remain, $2 (low) and $3 (high)
	mflo	$24
	mfhi	$25
	addu	$2,$24
	sltu	$1,$2,$24
	addu	$25,$1
	addu	$3,$25
	sw	$2,6*4($4)		# r[6]
	sw	$3,7*4($4)		# r[7]

	.set	noreorder
	jr	$31
	nop				# branch delay slot
.end	bn_sqr_comba4
2176