; Pick the code section that matches the assembler's output format.
; Exactly one of the three branches below is assembled.
%ifidn __OUTPUT_FORMAT__,win32
%ifndef __YASM_VERSION_ID__
; Not Yasm: define the @feat.00 feature symbol ourselves (SafeSEH marker,
; see the link below).
$@feat.00 equ 1
%else
; Yasm automatically includes @feat.00 and complains about redefining it.
; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
%if __YASM_VERSION_ID__ < 01010000h
%error yasm version 1.1.0 or later needed.
%endif
%endif
section	.text	code align=64
%elifidn __OUTPUT_FORMAT__,obj
section	code	use32 class=code align=64
%else
; Any other output format: plain code section, default alignment.
section	.text	code
%endif
; _OPENSSL_ia32cap_P is not imported with `extern`; this file declares it
; itself as a `common` symbol in .bss at the end.
;extern	_OPENSSL_ia32cap_P

;-----------------------------------------------------------------------
; _bn_mul_mont -- word-by-word Montgomery multiplication, 32-bit x86.
;
; C-equivalent prototype, as inferred from how the stack arguments are
; used (all six arguments are read from the stack):
;     int bn_mul_mont(uint32_t *rp, const uint32_t *ap,
;                     const uint32_t *bp, const uint32_t *np,
;                     const uint32_t *n0, int num);
;
; In:    rp     result vector, num words (written)
;        ap,bp  operand vectors, num words each (read)
;        np     modulus vector, num words (read)
;        n0     pointer; only the first word is read.  Presumably
;               -np[0]^-1 mod 2^32 -- confirm against the caller.
;        num    word count, compared as a signed value
; Out:   eax = 0 and nothing is written when num < 4;
;        eax = 1 otherwise, rp holding either tp or tp - np (see the
;        common tail).
; Saved: ebp, ebx, esi, edi (pushed on entry, popped on exit).
; Clobb: ecx, edx, flags; mm0-mm7 on the SSE2 path (emms is executed
;        before that path leaves).
;
; Frame after the prologue (esp is 64-byte aligned):
;     [esp+ 0]  num-1                          (squaring path only)
;     [esp+ 4]  rp
;     [esp+ 8]  ap
;     [esp+12]  bp; reused as the running pointer into bp (integer
;               multiply path) or as the outer index i (squaring path)
;     [esp+16]  np
;     [esp+20]  *n0
;     [esp+24]  caller's esp, restored in the common tail
;     [esp+28]  &bp[num]                       (integer multiply path only)
;     [esp+32]  tp[0 .. num+1], the temporary accumulator
;-----------------------------------------------------------------------
global	_bn_mul_mont
align	16
_bn_mul_mont:
L$_bn_mul_mont_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	xor	eax,eax			; return value for the early exit
	mov	edi,DWORD [40+esp]	; edi = num (6th argument)
	cmp	edi,4
	jl	NEAR L$000just_leave	; num < 4: return 0
	lea	esi,[20+esp]		; esi = address of the argument block
	lea	edx,[24+esp]		; edx = address of the 2nd argument slot
	; Compute the future frame base in ebp.  esp itself is only lowered by
	; the page walk further down, so it never skips over an untouched page.
	add	edi,2			; two extra words on top of tp
	neg	edi
	lea	ebp,[edi*4+esp-32]	; 32 bytes of slots + 4*(num+2) bytes of tp
	neg	edi			; edi = num+2 again
	; Place the frame relative to edx: make it equal modulo 2048, then force
	; bit 11 to differ, then round down to a 64-byte boundary.
	; NOTE(review): presumably to limit cache aliasing between the frame and
	; the caller's data -- confirm.
	mov	eax,ebp
	sub	eax,edx
	and	eax,2047
	sub	ebp,eax
	xor	edx,ebp
	and	edx,2048
	xor	edx,2048
	sub	ebp,edx
	and	ebp,-64
	; Page walk (a __chkstk-like probe): step esp down to ebp one 4096-byte
	; page at a time, reading one word from every page on the way, so that
	; stack pages are touched strictly in order.
	mov	eax,esp
	sub	eax,ebp
	and	eax,-4096		; whole pages between esp and ebp
	mov	edx,esp			; edx = caller's esp, stored in the frame below
	lea	esp,[eax*1+ebp]		; first probe: less than one page below old esp
	mov	eax,DWORD [esp]		; touch the page
	cmp	esp,ebp
	ja	NEAR L$016page_walk
	jmp	NEAR L$017page_walk_done
align	16
L$016page_walk:
	lea	esp,[esp-4096]
	mov	eax,DWORD [esp]		; touch the page
	cmp	esp,ebp
	ja	NEAR L$016page_walk
L$017page_walk_done:
	; Copy the arguments into the frame.
	mov	eax,DWORD [esi]		; rp
	mov	ebx,DWORD [4+esi]	; ap
	mov	ecx,DWORD [8+esi]	; bp
	mov	ebp,DWORD [12+esi]	; np
	mov	esi,DWORD [16+esi]	; n0 (pointer)
	mov	esi,DWORD [esi]		; *n0
	mov	DWORD [4+esp],eax
	mov	DWORD [8+esp],ebx
	mov	DWORD [12+esp],ecx
	mov	DWORD [16+esp],ebp
	mov	DWORD [20+esp],esi
	lea	ebx,[edi-3]		; ebx = num-1
	mov	DWORD [24+esp],edx	; caller's esp
	; Dispatch on bit 26 of the first word of the capability vector.
	lea	eax,[_OPENSSL_ia32cap_P]
	bt	DWORD [eax],26
	jnc	NEAR L$001non_sse2
	;---------------------------------------------------------------
	; SSE2 path: 32x32->64 multiplies (pmuludq) on the MMX registers.
	;   mm7 = 0x00000000FFFFFFFF (low-word mask)
	;   mm4 = bp[i], mm5 = m (low word), mm2/mm3 = running carries
	;   edx = outer index i, ecx = inner index j
	;---------------------------------------------------------------
	mov	eax,-1
	movd	mm7,eax
	mov	esi,DWORD [8+esp]	; esi = ap
	mov	edi,DWORD [12+esp]	; edi = bp
	mov	ebp,DWORD [16+esp]	; ebp = np
	xor	edx,edx			; i = 0
	xor	ecx,ecx			; j = 0
	movd	mm4,DWORD [edi]		; bp[0]
	movd	mm5,DWORD [esi]		; ap[0]
	movd	mm3,DWORD [ebp]		; np[0]
	pmuludq	mm5,mm4			; ap[0]*bp[0]
	movq	mm2,mm5
	movq	mm0,mm5
	pand	mm0,mm7			; low word of the product
	pmuludq	mm5,[20+esp]		; m = low word * (*n0); only low dwords are multiplied
	pmuludq	mm3,mm5			; np[0]*m
	paddq	mm3,mm0
	movd	mm1,DWORD [4+ebp]	; np[1]
	movd	mm0,DWORD [4+esi]	; ap[1]
	psrlq	mm2,32			; keep the carries only
	psrlq	mm3,32
	inc	ecx
align	16
L$0021st:
	; First pass, j = 1 .. num-2:
	;   tp[j-1] = low word of ap[j]*bp[0] + np[j]*m + carries
	pmuludq	mm0,mm4
	pmuludq	mm1,mm5
	paddq	mm2,mm0
	paddq	mm3,mm1
	movq	mm0,mm2
	pand	mm0,mm7
	movd	mm1,DWORD [4+ecx*4+ebp]	; preload np[j+1]
	paddq	mm3,mm0
	movd	mm0,DWORD [4+ecx*4+esi]	; preload ap[j+1]
	psrlq	mm2,32
	movd	DWORD [28+ecx*4+esp],mm3
	psrlq	mm3,32
	lea	ecx,[1+ecx]
	cmp	ecx,ebx
	jl	NEAR L$0021st
	; Last word of the first pass (j = num-1) and the top carry.
	pmuludq	mm0,mm4
	pmuludq	mm1,mm5
	paddq	mm2,mm0
	paddq	mm3,mm1
	movq	mm0,mm2
	pand	mm0,mm7
	paddq	mm3,mm0
	movd	DWORD [28+ecx*4+esp],mm3
	psrlq	mm2,32
	psrlq	mm3,32
	paddq	mm3,mm2
	movq	[32+ebx*4+esp],mm3	; tp[num-1], tp[num]
	inc	edx			; i = 1
L$003outer:
	; Outer loop over bp[i], i = 1 .. num-1; tp is accumulated in place.
	xor	ecx,ecx
	movd	mm4,DWORD [edx*4+edi]	; bp[i]
	movd	mm5,DWORD [esi]		; ap[0]
	movd	mm6,DWORD [32+esp]	; tp[0]
	movd	mm3,DWORD [ebp]		; np[0]
	pmuludq	mm5,mm4
	paddq	mm5,mm6			; ap[0]*bp[i] + tp[0]
	movq	mm0,mm5
	movq	mm2,mm5
	pand	mm0,mm7
	pmuludq	mm5,[20+esp]		; m for this row
	pmuludq	mm3,mm5
	paddq	mm3,mm0
	movd	mm6,DWORD [36+esp]	; tp[1]
	movd	mm1,DWORD [4+ebp]
	movd	mm0,DWORD [4+esi]
	psrlq	mm2,32
	psrlq	mm3,32
	paddq	mm2,mm6
	inc	ecx
	dec	ebx			; ebx = num-2, used as the inner loop counter
L$004inner:
	pmuludq	mm0,mm4
	pmuludq	mm1,mm5
	paddq	mm2,mm0
	paddq	mm3,mm1
	movq	mm0,mm2
	movd	mm6,DWORD [36+ecx*4+esp]	; preload tp[j+1]
	pand	mm0,mm7
	movd	mm1,DWORD [4+ecx*4+ebp]
	paddq	mm3,mm0
	movd	mm0,DWORD [4+ecx*4+esi]
	psrlq	mm2,32
	movd	DWORD [28+ecx*4+esp],mm3	; tp[j-1]
	psrlq	mm3,32
	paddq	mm2,mm6
	dec	ebx			; sets ZF for the jnz below (lea leaves flags alone)
	lea	ecx,[1+ecx]
	jnz	NEAR L$004inner
	mov	ebx,ecx			; ebx = num-1 again
	pmuludq	mm0,mm4
	pmuludq	mm1,mm5
	paddq	mm2,mm0
	paddq	mm3,mm1
	movq	mm0,mm2
	pand	mm0,mm7
	paddq	mm3,mm0
	movd	DWORD [28+ecx*4+esp],mm3
	psrlq	mm2,32
	psrlq	mm3,32
	movd	mm6,DWORD [36+ebx*4+esp]	; tp[num]
	paddq	mm3,mm2
	paddq	mm3,mm6
	movq	[32+ebx*4+esp],mm3	; tp[num-1], tp[num]
	lea	edx,[1+edx]
	cmp	edx,ebx
	jle	NEAR L$003outer
	emms				; leave MMX state before returning to the caller
	jmp	NEAR L$005common_tail
align	16
L$001non_sse2:
	;---------------------------------------------------------------
	; Integer path.  The dedicated squaring code is used when ap == bp
	; and num is even; otherwise the general multiply below runs.
	;---------------------------------------------------------------
	mov	esi,DWORD [8+esp]	; esi = ap
	lea	ebp,[1+ebx]		; ebp = num
	mov	edi,DWORD [12+esp]	; edi = bp
	xor	ecx,ecx			; j = 0
	mov	edx,esi
	and	ebp,1			; num & 1
	sub	edx,edi			; ap - bp
	lea	eax,[4+ebx*4+edi]	; eax = &bp[num]
	or	ebp,edx			; ZF set iff num is even and ap == bp
	mov	edi,DWORD [edi]		; edi = bp[0] (mov keeps the flags)
	jz	NEAR L$006bn_sqr_mont
	mov	DWORD [28+esp],eax	; end-of-bp sentinel
	mov	eax,DWORD [esi]		; ap[0]
	xor	edx,edx
align	16
L$007mull:
	; tp[j] = low word of ap[j]*bp[0] + carry, j = 0 .. num-2
	mov	ebp,edx
	mul	edi
	add	ebp,eax
	lea	ecx,[1+ecx]
	adc	edx,0
	mov	eax,DWORD [ecx*4+esi]
	cmp	ecx,ebx
	mov	DWORD [28+ecx*4+esp],ebp
	jl	NEAR L$007mull
	; Last word, then start the reduction: m = tp[0] * (*n0).
	mov	ebp,edx
	mul	edi
	mov	edi,DWORD [20+esp]
	add	eax,ebp
	mov	esi,DWORD [16+esp]	; esi = np
	adc	edx,0
	imul	edi,DWORD [32+esp]	; edi = m
	mov	DWORD [32+ebx*4+esp],eax	; tp[num-1]
	xor	ecx,ecx
	mov	DWORD [36+ebx*4+esp],edx	; tp[num]
	mov	DWORD [40+ebx*4+esp],ecx	; tp[num+1] = 0
	mov	eax,DWORD [esi]
	mul	edi
	add	eax,DWORD [32+esp]	; only the carry out of np[0]*m + tp[0] is kept
	mov	eax,DWORD [4+esi]
	adc	edx,0
	inc	ecx
	jmp	NEAR L$0082ndmadd
align	16
L$0091stmadd:
	; tp[j] = low word of ap[j]*bp[i] + tp[j] + carry, j = 0 .. num-2
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD [32+ecx*4+esp]
	lea	ecx,[1+ecx]
	adc	edx,0
	add	ebp,eax
	mov	eax,DWORD [ecx*4+esi]
	adc	edx,0
	cmp	ecx,ebx
	mov	DWORD [28+ecx*4+esp],ebp
	jl	NEAR L$0091stmadd
	; Last word, propagate into tp[num] / tp[num+1], then m = tp[0] * (*n0).
	mov	ebp,edx
	mul	edi
	add	eax,DWORD [32+ebx*4+esp]
	mov	edi,DWORD [20+esp]
	adc	edx,0
	mov	esi,DWORD [16+esp]	; esi = np
	add	ebp,eax
	adc	edx,0
	imul	edi,DWORD [32+esp]	; edi = m
	xor	ecx,ecx
	add	edx,DWORD [36+ebx*4+esp]
	mov	DWORD [32+ebx*4+esp],ebp
	adc	ecx,0
	mov	eax,DWORD [esi]
	mov	DWORD [36+ebx*4+esp],edx
	mov	DWORD [40+ebx*4+esp],ecx
	mul	edi
	add	eax,DWORD [32+esp]	; only the carry is kept
	mov	eax,DWORD [4+esi]
	adc	edx,0
	mov	ecx,1
align	16
L$0082ndmadd:
	; Reduction: tp[j-1] = low word of np[j]*m + tp[j] + carry, j = 1 .. num-2
	; (the result is stored one word lower than it was read).
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD [32+ecx*4+esp]
	lea	ecx,[1+ecx]
	adc	edx,0
	add	ebp,eax
	mov	eax,DWORD [ecx*4+esi]
	adc	edx,0
	cmp	ecx,ebx
	mov	DWORD [24+ecx*4+esp],ebp
	jl	NEAR L$0082ndmadd
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD [32+ebx*4+esp]
	adc	edx,0
	add	ebp,eax
	adc	edx,0
	mov	DWORD [28+ebx*4+esp],ebp	; tp[num-2]
	xor	eax,eax
	mov	ecx,DWORD [12+esp]	; running pointer into bp
	add	edx,DWORD [36+ebx*4+esp]
	adc	eax,DWORD [40+ebx*4+esp]
	lea	ecx,[4+ecx]		; advance to the next word of bp
	mov	DWORD [32+ebx*4+esp],edx	; tp[num-1]
	cmp	ecx,DWORD [28+esp]	; reached &bp[num]?
	mov	DWORD [36+ebx*4+esp],eax	; tp[num]
	je	NEAR L$005common_tail
	mov	edi,DWORD [ecx]		; edi = next word of bp
	mov	esi,DWORD [8+esp]	; esi = ap
	mov	DWORD [12+esp],ecx
	xor	ecx,ecx
	xor	edx,edx
	mov	eax,DWORD [esi]
	jmp	NEAR L$0091stmadd
align	16
L$006bn_sqr_mont:
	;---------------------------------------------------------------
	; Squaring path (ap == bp, num even).  Cross products are doubled
	; on the fly: ebx carries the bit shifted out by each doubling.
	;---------------------------------------------------------------
	mov	DWORD [esp],ebx		; [esp] = num-1
	mov	DWORD [12+esp],ecx	; outer index i = 0
	mov	eax,edi
	mul	edi			; ap[0]^2
	mov	DWORD [32+esp],eax	; tp[0]
	mov	ebx,edx
	shr	edx,1			; halve the high word; it is doubled back below
	and	ebx,1			; the bit that was shifted out
	inc	ecx
align	16
L$010sqr:
	mov	eax,DWORD [ecx*4+esi]
	mov	ebp,edx
	mul	edi
	add	eax,ebp
	lea	ecx,[1+ecx]
	adc	edx,0
	lea	ebp,[eax*2+ebx]		; double and add the previous top bit
	shr	eax,31			; top bit lost by the doubling
	cmp	ecx,DWORD [esp]
	mov	ebx,eax
	mov	DWORD [28+ecx*4+esp],ebp
	jl	NEAR L$010sqr
	; Last word of the first row, then m = tp[0] * (*n0).
	mov	eax,DWORD [ecx*4+esi]
	mov	ebp,edx
	mul	edi
	add	eax,ebp
	mov	edi,DWORD [20+esp]
	adc	edx,0
	mov	esi,DWORD [16+esp]	; esi = np
	lea	ebp,[eax*2+ebx]
	imul	edi,DWORD [32+esp]	; edi = m
	shr	eax,31
	mov	DWORD [32+ecx*4+esp],ebp
	lea	ebp,[edx*2+eax]
	mov	eax,DWORD [esi]
	shr	edx,31
	mov	DWORD [36+ecx*4+esp],ebp
	mov	DWORD [40+ecx*4+esp],edx
	mul	edi
	add	eax,DWORD [32+esp]	; only the carry is kept
	mov	ebx,ecx			; ebx = num-1
	adc	edx,0
	mov	eax,DWORD [4+esi]
	mov	ecx,1
align	16
L$0113rdmadd:
	; Reduction, two words per iteration (relies on num being even):
	; tp[j-1] = low word of np[j]*m + tp[j] + carry.
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD [32+ecx*4+esp]
	adc	edx,0
	add	ebp,eax
	mov	eax,DWORD [4+ecx*4+esi]
	adc	edx,0
	mov	DWORD [28+ecx*4+esp],ebp
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD [36+ecx*4+esp]
	lea	ecx,[2+ecx]
	adc	edx,0
	add	ebp,eax
	mov	eax,DWORD [ecx*4+esi]
	adc	edx,0
	cmp	ecx,ebx
	mov	DWORD [24+ecx*4+esp],ebp
	jl	NEAR L$0113rdmadd
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD [32+ebx*4+esp]
	adc	edx,0
	add	ebp,eax
	adc	edx,0
	mov	DWORD [28+ebx*4+esp],ebp
	mov	ecx,DWORD [12+esp]	; outer index i
	xor	eax,eax
	mov	esi,DWORD [8+esp]	; esi = ap
	add	edx,DWORD [36+ebx*4+esp]
	adc	eax,DWORD [40+ebx*4+esp]
	mov	DWORD [32+ebx*4+esp],edx
	cmp	ecx,ebx			; i == num-1: all rows done
	mov	DWORD [36+ebx*4+esp],eax
	je	NEAR L$005common_tail
	; Next row: add ap[i]^2 for the new i, then the doubled cross products.
	mov	edi,DWORD [4+ecx*4+esi]
	lea	ecx,[1+ecx]
	mov	eax,edi
	mov	DWORD [12+esp],ecx	; i += 1
	mul	edi
	add	eax,DWORD [32+ecx*4+esp]
	adc	edx,0
	mov	DWORD [32+ecx*4+esp],eax
	xor	ebp,ebp
	cmp	ecx,ebx
	lea	ecx,[1+ecx]		; lea keeps the flags from the cmp
	je	NEAR L$012sqrlast	; i == num-1: no cross products left
	mov	ebx,edx
	shr	edx,1
	and	ebx,1
align	16
L$013sqradd:
	mov	eax,DWORD [ecx*4+esi]
	mov	ebp,edx
	mul	edi
	add	eax,ebp
	lea	ebp,[eax*1+eax]		; double the low word
	adc	edx,0
	shr	eax,31			; top bit lost by the doubling
	add	ebp,DWORD [32+ecx*4+esp]
	lea	ecx,[1+ecx]
	adc	eax,0
	add	ebp,ebx
	adc	eax,0
	cmp	ecx,DWORD [esp]
	mov	DWORD [28+ecx*4+esp],ebp
	mov	ebx,eax
	jle	NEAR L$013sqradd
	; Double the final high word and fold in the pending top bit.
	mov	ebp,edx
	add	edx,edx
	shr	ebp,31
	add	edx,ebx
	adc	ebp,0
L$012sqrlast:
	mov	edi,DWORD [20+esp]
	mov	esi,DWORD [16+esp]	; esi = np
	imul	edi,DWORD [32+esp]	; edi = m = tp[0] * (*n0)
	add	edx,DWORD [32+ecx*4+esp]
	mov	eax,DWORD [esi]
	adc	ebp,0
	mov	DWORD [32+ecx*4+esp],edx
	mov	DWORD [36+ecx*4+esp],ebp
	mul	edi
	add	eax,DWORD [32+esp]	; only the carry is kept
	lea	ebx,[ecx-1]		; lea keeps CF for the adc below
	adc	edx,0
	mov	ecx,1
	mov	eax,DWORD [4+esi]
	jmp	NEAR L$0113rdmadd
align	16
L$005common_tail:
	;---------------------------------------------------------------
	; Final step, shared by all paths (ebx = num-1 on entry):
	;   rp = tp - np, then keep tp instead if that subtraction
	;   underflowed; tp is overwritten on the way.
	;---------------------------------------------------------------
	mov	ebp,DWORD [16+esp]	; ebp = np
	mov	edi,DWORD [4+esp]	; edi = rp
	lea	esi,[32+esp]		; esi = tp
	mov	eax,DWORD [esi]
	mov	ecx,ebx
	xor	edx,edx			; j = 0; also clears CF for the first sbb
align	16
L$014sub:
	sbb	eax,DWORD [edx*4+ebp]	; tp[j] - np[j] - borrow
	mov	DWORD [edx*4+edi],eax
	dec	ecx			; dec leaves CF (the borrow) untouched
	mov	eax,DWORD [4+edx*4+esi]
	lea	edx,[1+edx]
	jge	NEAR L$014sub
	sbb	eax,0			; eax = tp[num] - borrow: all ones -> keep tp, 0 -> keep tp - np
align	16
L$015copy:
	; Walk j = num-1 .. 0 and pick the result without branching on eax:
	;   rp[j] = ((tp[j] ^ rp[j]) & eax) ^ rp[j]
	mov	edx,DWORD [ebx*4+esi]
	mov	ebp,DWORD [ebx*4+edi]
	xor	edx,ebp
	and	edx,eax
	xor	edx,ebp
	mov	DWORD [ebx*4+esi],ecx	; overwrite tp[j] (ecx is -1 after the subtract loop)
	mov	DWORD [ebx*4+edi],edx
	dec	ebx
	jge	NEAR L$015copy
	mov	esp,DWORD [24+esp]	; back to the caller's stack pointer
	mov	eax,1			; success
L$000just_leave:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
; NUL-terminated identification string, emitted right after the code.
db	'Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>',0
; 16-byte common symbol; the code above tests bit 26 of its first dword.
section	.bss
common	_OPENSSL_ia32cap_P 16