1#!/usr/bin/env perl
2
# Work out the directory this script was invoked from (trailing separator
# kept) so that the perlasm helpers beside it are searched, with a plain
# relative "perlasm" directory as a second candidate.  $dir stays a package
# global, exactly as before.
($dir) = $0 =~ m{(.*[/\\])[^/\\]+$};
push @INC, "${dir}perlasm", "perlasm";
require "x86asm.pl";

# Hand the first command-line argument and this module's name to asm_init
# (defined in x86asm.pl).
&asm_init($ARGV[0], "crypto/cpu-x86-asm");

# $sse2 is set only when some argument carries -DOPENSSL_IA32_SSE2; it gates
# the SSE2-specific code emitted further down.
$sse2 = 1 if grep { /-DOPENSSL_IA32_SSE2/ } @ARGV;
10
# OPENSSL_ia32_cpuid: probe the processor with CPUID and return a filtered
# feature vector.
#
#   argument 0 : pointer to a buffer; this routine writes only the dword at
#                offset 8 (cleared first, then CPUID leaf 7 EBX when that
#                leaf exists, with bit 5 cleared again if AVX state is not
#                enabled).
#   returns    : eax = adjusted CPUID.1:EDX, edx = adjusted CPUID.1:ECX;
#                both are zero when the CPUID instruction is unavailable.
#
# function_begin/function_end and wparam come from x86asm.pl; presumably they
# emit the prologue/epilogue that preserves ebx/esi/edi/ebp, which are all
# used as scratch here -- confirm against x86asm.pl.
&function_begin("OPENSSL_ia32_cpuid");
	# CPUID is usable only if bit 21 of EFLAGS can be toggled: flip it,
	# read the flags back and see whether the change stuck.
	&xor	("edx","edx");
	&pushf	();
	&pop	("eax");
	&mov	("ecx","eax");
	&xor	("eax",1<<21);
	&push	("eax");
	&popf	();
	&pushf	();
	&pop	("eax");
	&xor	("ecx","eax");
	&xor	("eax","eax");
	&bt	("ecx",21);
	&jnc	(&label("nocpuid"));	# eax and edx are both zero here
	&mov	("esi",&wparam(0));
	&mov	(&DWP(8,"esi"),"eax");	# clear 3rd word
	&cpuid	();			# eax is 0: vendor string query
	&mov	("edi","eax");		# max value for standard query level

	# ebp accumulates the three vendor-string comparisons; it ends up 0
	# only if every one of them matched "GenuineIntel".
	&xor	("eax","eax");
	&cmp	("ebx",0x756e6547);	# "Genu"
	&setne	(&LB("eax"));
	&mov	("ebp","eax");
	&cmp	("edx",0x49656e69);	# "ineI"
	&setne	(&LB("eax"));
	&or	("ebp","eax");
	&cmp	("ecx",0x6c65746e);	# "ntel"
	&setne	(&LB("eax"));
	&or	("ebp","eax");		# 0 indicates Intel CPU
	&jz	(&label("intel"));

	# Same scheme in esi for "AuthenticAMD"; any other vendor is sent
	# down the "intel" path with ebp still non-zero.
	&cmp	("ebx",0x68747541);	# "Auth"
	&setne	(&LB("eax"));
	&mov	("esi","eax");
	&cmp	("edx",0x69746E65);	# "enti"
	&setne	(&LB("eax"));
	&or	("esi","eax");
	&cmp	("ecx",0x444D4163);	# "cAMD"
	&setne	(&LB("eax"));
	&or	("esi","eax");		# 0 indicates AMD CPU
	&jnz	(&label("intel"));

	# AMD specific
	&mov	("eax",0x80000000);
	&cpuid	();			# eax = highest extended query level
	&cmp	("eax",0x80000001);
	&jb	(&label("intel"));
	&mov	("esi","eax");		# remember highest extended level
	&mov	("eax",0x80000001);
	&cpuid	();
	&or	("ebp","ecx");
	# bit 0 is kept together with bit 11, so ebp stays non-zero for the
	# "cmp ebp,0" Intel test made after the nocacheinfo label
	&and	("ebp",1<<11|1);	# isolate XOP bit
	&cmp	("esi",0x80000008);
	&jb	(&label("intel"));

	&mov	("eax",0x80000008);
	&cpuid	();
	&movz	("esi",&LB("ecx"));	# number of cores - 1
	&inc	("esi");		# number of cores

	# Query level 1; when bit 28 of edx is set, keep it only if
	# ebx[23:16] exceeds the core count computed above.
	&mov	("eax",1);
	&xor	("ecx","ecx");
	&cpuid	();
	&bt	("edx",28);
	&jnc	(&label("generic"));
	&shr	("ebx",16);
	&and	("ebx",0xff);
	&cmp	("ebx","esi");
	&ja	(&label("generic"));
	&and	("edx",0xefffffff);	# clear hyper-threading bit
	&jmp	(&label("generic"));

&set_label("intel");
	# Query level 7 (sub-leaf 0) is read only when the CPU reports it;
	# its EBX becomes the 3rd word of the caller's buffer.
	&cmp	("edi",7);
	&jb	(&label("cacheinfo"));

	&mov	("esi",&wparam(0));
	&mov	("eax",7);
	&xor	("ecx","ecx");
	&cpuid	();
	&mov	(&DWP(8,"esi"),"ebx");

&set_label("cacheinfo");
	# mov leaves the flags alone, so the jb below still acts on the cmp:
	# edi stays -1 when query level 4 is not available.
	&cmp	("edi",4);
	&mov	("edi",-1);
	&jb	(&label("nocacheinfo"));

	&mov	("eax",4);
	&mov	("ecx",0);		# query L1D
	&cpuid	();
	&mov	("edi","eax");
	&shr	("edi",14);
	&and	("edi",0xfff);		# number of cores -1 per L1D

&set_label("nocacheinfo");
	&mov	("eax",1);
	&xor	("ecx","ecx");
	&cpuid	();
	&and	("edx",0xbfefffff);	# force reserved bits #20, #30 to 0
	&cmp	("ebp",0);
	&jne	(&label("notintel"));
	&or	("edx",1<<30);		# set reserved bit#30 on Intel CPUs
	&and	(&HB("eax"),15);	# family ID
	&cmp	(&HB("eax"),15);	# P4?
	&jne	(&label("notintel"));
	&or	("edx",1<<20);		# set reserved bit#20 to engage RC4_CHAR
&set_label("notintel");
	# Bit 28 handling: clear it, leave it cleared when edi is 0, otherwise
	# set it again and keep it only if the low byte of ebx>>16 exceeds 1.
	&bt	("edx",28);		# test hyper-threading bit
	&jnc	(&label("generic"));
	&and	("edx",0xefffffff);
	&cmp	("edi",0);
	&je	(&label("generic"));

	&or	("edx",0x10000000);
	&shr	("ebx",16);
	&cmp	(&LB("ebx"),1);
	&ja	(&label("generic"));
	&and	("edx",0xefffffff);	# clear hyper-threading bit if not

&set_label("generic");
	# From here on esi carries the value returned in eax (from edx) and
	# ebp the value returned in edx (from ecx, with bit 11 taken from the
	# AMD XOP flag).
	&and	("ebp",1<<11);		# isolate AMD XOP flag
	&and	("ecx",0xfffff7ff);	# force 11th bit to 0
	&mov	("esi","edx");
	&or	("ebp","ecx");		# merge AMD XOP flag

	&bt	("ecx",27);		# check OSXSAVE bit
	&jnc	(&label("clear_avx"));
	&xor	("ecx","ecx");
	&data_byte(0x0f,0x01,0xd0);	# xgetbv
	# Only bits 1 and 2 of the xgetbv result matter: both set -> nothing
	# is masked; only bit 1 -> mask the AVX family; anything else falls
	# through clear_xmm and masks the XMM-dependent bits as well.
	&and	("eax",6);
	&cmp	("eax",6);
	&je	(&label("done"));
	&cmp	("eax",2);
	&je	(&label("clear_avx"));
&set_label("clear_xmm");
	&and	("ebp",0xfdfffffd);	# clear AESNI and PCLMULQDQ bits
	&and	("esi",0xfeffffff);	# clear FXSR
&set_label("clear_avx");
	&and	("ebp",0xefffe7ff);	# clear AVX, FMA and AMD XOP bits
	&mov	("edi",&wparam(0));
	&and	(&DWP(8,"edi"),0xffffffdf);	# clear AVX2
&set_label("done");
	&mov	("eax","esi");
	&mov	("edx","ebp");
&set_label("nocpuid");
&function_end("OPENSSL_ia32_cpuid");
157
# The routines below read OPENSSL_ia32cap_P, which is not defined in this
# file; external_label (x86asm.pl) presumably declares it as an external
# symbol for the assembler.
&external_label("OPENSSL_ia32cap_P");

# OPENSSL_rdtsc: return the time-stamp counter in edx:eax.  When bit 4 of the
# first OPENSSL_ia32cap_P word is clear, rdtsc is skipped and edx:eax stay
# zero.  Takes no arguments.
&function_begin_B("OPENSSL_rdtsc","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
	&xor	("eax","eax");		# default result: 0
	&xor	("edx","edx");
	&picmeup("ecx","OPENSSL_ia32cap_P");	# ecx = address of the vector
	&bt	(&DWP(0,"ecx"),4);
	&jnc	(&label("notsc"));
	&rdtsc	();
&set_label("notsc");
	&ret	();
&function_end_B("OPENSSL_rdtsc");
170
# This works in Ring 0 only [read DJGPP+MS-DOS+privileged DPMI host],
# but it's safe to call it on any [supported] 32-bit platform...
# Just check for [non-]zero return value...
#
# OPENSSL_instrument_halt: execute hlt and return, in edx:eax, the difference
# between the time-stamp counter read after and before it.  Returns zero in
# edx:eax without halting when any of the three checks below fails.
&function_begin_B("OPENSSL_instrument_halt","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
	&picmeup("ecx","OPENSSL_ia32cap_P");
	&bt	(&DWP(0,"ecx"),4);
	&jnc	(&label("nohalt"));	# no TSC

	&data_word(0x9058900e);		# push %cs; pop %eax
	&and	("eax",3);		# low two bits of %cs
	&jnz	(&label("nohalt"));	# not enough privileges

	&pushf	();
	&pop	("eax");
	&bt	("eax",9);		# bit 9 of the flags register
	&jnc	(&label("nohalt"));	# interrupts are disabled

	# Park the starting counter value on the stack, halt, then read the
	# counter again once execution resumes.
	&rdtsc	();
	&push	("edx");
	&push	("eax");
	&halt	();
	&rdtsc	();

	# 64-bit subtraction: low dword first, borrow carried into the high one.
	&sub	("eax",&DWP(0,"esp"));
	&sbb	("edx",&DWP(4,"esp"));
	&add	("esp",8);		# drop the two saved dwords
	&ret	();

&set_label("nohalt");
	&xor	("eax","eax");
	&xor	("edx","edx");
	&ret	();
&function_end_B("OPENSSL_instrument_halt");
204
# Essentially there is only one use for this function. Under DJGPP:
#
#	#include <go32.h>
#	...
#	i=OPENSSL_far_spin(_dos_ds,0x46c);
#	...
# to obtain the number of spins till closest timer interrupt.
#
# Arguments (read from the stack): 1st is loaded into %ds, 2nd is the offset
# of the dword to watch.  The return value in eax is the number of loop
# iterations executed until that dword changed; eax and edx are zero when
# bit 9 of the flags register is clear.

&function_begin_B("OPENSSL_far_spin");
	&pushf	();
	&pop	("eax");
	&bt	("eax",9);
	&jnc	(&label("nospin"));	# interrupts are disabled

	# Both arguments are fetched before %ds is pushed, so the esp-relative
	# offsets still refer to the caller's frame.
	&mov	("eax",&DWP(4,"esp"));
	&mov	("ecx",&DWP(8,"esp"));
	&data_word (0x90d88e1e);	# push %ds, mov %eax,%ds
	&xor	("eax","eax");		# spin counter starts at 0
	&mov	("edx",&DWP(0,"ecx"));	# snapshot of the watched dword
	&jmp	(&label("spin"));

	&align	(16);
&set_label("spin");
	&inc	("eax");
	&cmp	("edx",&DWP(0,"ecx"));
	&je	(&label("spin"));	# loop while the dword is unchanged

	&data_word (0x1f909090);	# pop	%ds
	&ret	();

&set_label("nospin");
	&xor	("eax","eax");
	&xor	("edx","edx");
	&ret	();
&function_end_B("OPENSSL_far_spin");
240
# OPENSSL_wipe_cpu: zero the scratch registers eax/edx, the XMM registers
# (only when built with -DOPENSSL_IA32_SSE2 and the CPU reports both SSE2 and
# FXSR) and the x87/MMX register bank.  Takes no arguments.  Returns in eax
# the address esp+4, i.e. the stack location just above the return address.
&function_begin_B("OPENSSL_wipe_cpu","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
	&xor	("eax","eax");
	&xor	("edx","edx");
	&picmeup("ecx","OPENSSL_ia32cap_P");
	&mov	("ecx",&DWP(0,"ecx"));	# ecx = first capability word
	# ecx now holds the capability bits themselves, not a pointer, so the
	# test must use the register form of bt; the memory form would treat
	# the feature bits as an address.  Bit 0 is the on-chip x87 FPU flag
	# in the CPUID.1:EDX layout this word follows (cf. the SSE2/FXSR bit
	# numbers used just below).
	&bt	("ecx",0);
	&jnc	(&label("no_x87"));
	if ($sse2) {
		&and	("ecx",1<<26|1<<24);	# check SSE2 and FXSR bits
		&cmp	("ecx",1<<26|1<<24);
		&jne	(&label("no_sse2"));
		# clear xmm0 through xmm7
		for my $n (0..7) {
			&pxor	("xmm$n","xmm$n");
		}
	&set_label("no_sse2");
	}
	# just a bunch of fldz to zap the fp/mm bank followed by finit...
	&data_word(0xeed9eed9,0xeed9eed9,0xeed9eed9,0xeed9eed9,0x90e3db9b);
&set_label("no_x87");
	&lea	("eax",&DWP(4,"esp"));
	&ret	();
&function_end_B("OPENSSL_wipe_cpu");
268
# OPENSSL_atomic_add: atomically add the 2nd argument to the dword the 1st
# argument points at, using a lock cmpxchg retry loop, and return the new
# value in eax.
&function_begin_B("OPENSSL_atomic_add");
	&mov	("edx",&DWP(4,"esp"));	# fetch the pointer, 1st arg
	&mov	("ecx",&DWP(8,"esp"));	# fetch the increment, 2nd arg
	&push	("ebx");		# ebx is used as scratch, restored below
	&nop	();
	&mov	("eax",&DWP(0,"edx"));	# current value, the expected operand
&set_label("spin");
	&lea	("ebx",&DWP(0,"eax","ecx"));	# ebx = eax + ecx, the candidate value
	&nop	();
	&data_word(0x1ab10ff0);	# lock;	cmpxchg	%ebx,(%edx)	# %eax is involved and is always reloaded
	&jne	(&label("spin"));	# comparison failed: retry with the reloaded eax
	&mov	("eax","ebx");	# OpenSSL expects the new value
	&pop	("ebx");
	&ret	();
&function_end_B("OPENSSL_atomic_add");
284
# OPENSSL_indirect_call is useful under Win32 when the calling convention
# of an indirectly called function -- __stdcall or __cdecl(*) -- is not
# known. From C it can be used as
#
#ifdef OPENSSL_CPUID_OBJ
#	type OPENSSL_indirect_call(void *f,...);
#	...
#	OPENSSL_indirect_call(func,[up to $max arguments]);
#endif
#
# (*)	it is designed to work for __fastcall too, provided the number
#	of arguments is 1 or 2!
&function_begin_B("OPENSSL_indirect_call");
	{
	# Number of argument slots copied for the callee.  It has to be of
	# the form 4*n-1 so that eventual stack alignment is preserved.
	my $max = 7;

	&push	("ebp");
	&mov	("ebp","esp");
	&sub	("esp",$max*4);

	# The first two arguments are moved through ecx and edx, so those
	# registers hold them when the callee is entered.
	&mov	("ecx",&DWP(12,"ebp"));
	&mov	(&DWP(0,"esp"),"ecx");
	&mov	("edx",&DWP(16,"ebp"));
	&mov	(&DWP(4,"esp"),"edx");

	# The remaining slots are copied unconditionally; some of these
	# copies will be redundant/bogus when fewer arguments were passed.
	foreach my $slot (2 .. $max-1)
		{
		my $offset = $slot*4;
		&mov	("eax",&DWP(12+$offset,"ebp"));
		&mov	(&DWP($offset,"esp"),"eax");
		}

	&call_ptr	(&DWP(8,"ebp"));	# call through the pointer argument

	# Reload esp from ebp instead of adjusting it, so the result is the
	# same whether the callee was (__cdecl *func) or (__stdcall *one).
	&mov	("esp","ebp");
	&pop	("ebp");
	&ret	();
	}
&function_end_B("OPENSSL_indirect_call");
323
# OPENSSL_ia32_rdrand: try rdrand up to 8 times and return the value in eax.
# If eax is zero on exit from the loop, the remaining retry count in ecx is
# returned instead, so a zero result is only returned once ecx has counted
# down to zero, i.e. after every attempt left the carry flag clear.
&function_begin_B("OPENSSL_ia32_rdrand");
	&mov	("ecx",8);		# retry budget
&set_label("loop");
	&rdrand	("eax");
	&jc	(&label("break"));	# carry set: leave the retry loop
	&loop	(&label("loop"));	# decrement ecx, retry while non-zero
&set_label("break");
	&cmp	("eax",0);
	&cmove	("eax","ecx");		# eax == 0: substitute the counter
	&ret	();
&function_end_B("OPENSSL_ia32_rdrand");
335
# NOTE(review): hidden() comes from x86asm.pl; presumably it gives the
# OPENSSL_ia32cap_P symbol hidden visibility in the output -- confirm there.
&hidden("OPENSSL_ia32cap_P");

# Last call of the script; presumably x86asm.pl writes out the accumulated
# assembly here -- confirm against x86asm.pl.
&asm_finish();
339