@ ----------------------------------------------------------------------
@ Montgomery multiplication for 32-bit ARM (ARMv4 instruction subset),
@ GNU as syntax ('@' starts a comment).
@
@ Presumed C prototype (limb type name not visible here; limbs are
@ 32-bit words as far as this code is concerned):
@
@   int bn_mul_mont(u32 *rp, const u32 *ap, const u32 *bp,
@                   const u32 *np, const u32 *n0, int num);
@
@ In:   r0 = rp      result, num words
@       r1 = ap      multiplicand, num words
@       r2 = bp      multiplier, num words
@       r3 = np      modulus, num words
@       [sp,#0] = n0 pointer; only the first word (*n0) is read
@       [sp,#4] = num
@ Out:  r0 = 1 on success, 0 if num < 2 (nothing is computed then)
@ Clobbers: r1-r3, flags.  r4-r12 and lr are saved and restored.
@
@ A temporary vector tp[0..num] (num+1 words) is carved out of the stack
@ and overwritten ("zapped") for tp[0..num-1] before returning.
@
@ Frame layout once the prologue is done, relative to r0 = &tp[num-1]:
@   [r0,#0*4]         tp[num-1]
@   [r0,#1*4]         tp[num]      (top carry word)
@   [r0,#2*4..#11*4]  saved r4-r12,lr
@   [r0,#12*4]        saved r0 (rp)
@   [r0,#13*4]        saved r2 (bp); reused as the running &bp[i]
@   [r0,#14*4]        &n0 argument; overwritten with the value *n0
@   [r0,#15*4]        num argument; overwritten with &bp[num-1]
@ ----------------------------------------------------------------------
.text

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	2
bn_mul_mont:
	stmdb	sp!,{r0,r2}		@ sp points at argument block
	ldr	r0,[sp,#3*4]		@ load num
	cmp	r0,#2
	movlt	r0,#0			@ num<2: return 0 ...
	addlt	sp,sp,#2*4		@ ... after dropping {r0,r2}
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	r0,r0,lsl#2		@ rescale r0 for byte count
	sub	sp,sp,r0		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	r0,r0,#4		@ "num=num-1"
	add	r4,r2,r0		@ &bp[num-1]

	add	r0,sp,r0		@ r0 to point at &tp[num-1]
	ldr	r8,[r0,#14*4]		@ &n0
	ldr	r2,[r2]			@ bp[0]
	ldr	r5,[r1],#4		@ ap[0],ap++
	ldr	r6,[r3],#4		@ np[0],np++
	ldr	r8,[r8]			@ *n0
	str	r4,[r0,#15*4]		@ save &bp[num-1] (outer-loop end marker)

	@ First pass (i=0): tp = (ap*bp[0] + m*np) >> 32, m = low word * n0.
	umull	r10,r11,r5,r2		@ ap[0]*bp[0]
	str	r8,[r0,#14*4]		@ save n0 value
	mul	r8,r10,r8		@ "tp[0]"*n0
	mov	r12,#0
	umlal	r10,r12,r6,r8		@ np[0]*n0+"t[0]"
	mov	r4,sp			@ r4 = &tp[0]

.L1st:
	ldr	r5,[r1],#4		@ ap[j],ap++
	mov	r10,r11			@ carry word of the ap*bp chain
	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[j]*bp[0]
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r14,#0
	umlal	r12,r14,r6,r8		@ np[j]*n0
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0			@ until r4 reaches &tp[num-1]
	bne	.L1st

	adds	r12,r12,r11
	mov	r14,#0
	adc	r14,r14,#0
	ldr	r4,[r0,#13*4]		@ restore bp
	str	r12,[r0]		@ tp[num-1]=
	ldr	r8,[r0,#14*4]		@ restore n0
	str	r14,[r0,#4]		@ tp[num]=

	@ Remaining passes (i=1..num-1): tp = (tp + ap*bp[i] + m*np) >> 32.
.Louter:
	sub	r7,r0,sp		@ byte count 4*(num-1)
	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
	sub	r3,r3,r7		@ "rewind" np to &np[1]
	ldr	r2,[r4,#4]!		@ *(++bp)
	ldr	r5,[r1,#-4]		@ ap[0]
	ldr	r6,[r3,#-4]		@ np[0]
	ldr	r10,[sp]		@ tp[0]
	ldr	r7,[sp,#4]		@ tp[1]

	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[0]*bp[i]+tp[0]
	str	r4,[r0,#13*4]		@ save bp
	mul	r8,r10,r8		@ m = low word * n0
	mov	r12,#0
	umlal	r10,r12,r6,r8		@ np[0]*n0+"tp[0]"
	mov	r4,sp			@ r4 = &tp[0]

.Linner:
	ldr	r5,[r1],#4		@ ap[j],ap++
	adds	r10,r11,r7		@ +=tp[j]; carry consumed by adc below
	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[j]*bp[i]
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r14,#0
	umlal	r12,r14,r6,r8		@ np[j]*n0
	ldr	r7,[r4,#8]		@ tp[j+1]
	adc	r11,r11,#0		@ fold in carry from the adds above
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0			@ until r4 reaches &tp[num-1]
	bne	.Linner

	adds	r12,r12,r11
	mov	r14,#0
	adc	r14,r14,#0
	adds	r12,r12,r7		@ + old tp[num]
	adc	r14,r14,#0
	ldr	r4,[r0,#13*4]		@ restore bp
	ldr	r7,[r0,#15*4]		@ restore &bp[num-1]
	str	r12,[r0]		@ tp[num-1]=
	ldr	r8,[r0,#14*4]		@ restore n0
	str	r14,[r0,#4]		@ tp[num]=

	cmp	r4,r7			@ last multiplier word processed?
	bne	.Louter

	@ Final reduction: rp = tp - np, then pick tp or rp without branching.
	ldr	r2,[r0,#12*4]		@ pull rp
	add	r0,r0,#4		@ r0 to point at &tp[num]
	sub	r5,r0,sp		@ "original" num value (in bytes)
	mov	r4,sp			@ "rewind" r4
	mov	r1,r4			@ "borrow" r1
	sub	r3,r3,r5		@ "rewind" r3 to &np[0]

	subs	r7,r7,r7		@ "clear" carry flag (C=1: no borrow)
.Lsub:	ldr	r7,[r4],#4
	ldr	r6,[r3],#4
	sbcs	r7,r7,r6		@ tp[j]-np[j]
	str	r7,[r2],#4		@ rp[j]=
	teq	r4,r0			@ preserve carry
	bne	.Lsub
	sbcs	r14,r14,#0		@ upmost carry
	mov	r4,sp			@ "rewind" r4
	sub	r2,r2,r5		@ "rewind" r2

	and	r1,r4,r14		@ r14 is used as a select mask here
	bic	r3,r2,r14
	orr	r1,r1,r3		@ ap=borrow?tp:rp

.Lcopy:	ldr	r7,[r1],#4		@ copy or in-place refresh
	str	sp,[r4],#4		@ zap tp
	str	r7,[r2],#4
	cmp	r4,r0			@ until r4 reaches &tp[num]
	bne	.Lcopy

	add	sp,r0,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1			@ success
.Labrt:	tst	lr,#1			@ bit 0 of lr set: caller is Thumb
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e		@ interoperable with Thumb ISA:-)  (= bx lr)
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align	2