#!/usr/bin/env perl

# perlasm generator for the 32-bit x86 CPU identification helpers.
# Every &instruction(...) call below is forwarded to x86asm.pl, which emits
# the corresponding assembly for the flavour selected by $ARGV[0].

# Directory part of the script path (including the trailing separator).
# Fall back to the empty string when the script is invoked without a
# directory component, so that an unset/stale $1 is never used.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC, "${dir}perlasm", "perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"crypto/cpu-x86-asm");

# $sse2 enables the SSE2 register wipe in OPENSSL_wipe_cpu.
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# OPENSSL_ia32_cpuid(ptr)
#
# Probes the processor with CPUID and returns the (adjusted) feature words:
#   eax       - CPUID.1:EDX with reserved bits #20/#30 repurposed and the
#               hyper-threading bit (#28) possibly cleared;
#   edx       - CPUID.1:ECX with bit #11 replaced by the AMD XOP flag;
#   ptr[2]    - (dword at offset 8 of the first argument) CPUID.7:EBX,
#               with the AVX2 bit cleared when the OS does not preserve
#               YMM state.
# Returns eax=edx=0 when the CPUID instruction is not available.
&function_begin("OPENSSL_ia32_cpuid");
    &xor    ("edx","edx");
    # Try to toggle EFLAGS bit 21; if it does not stick, there is no CPUID.
    &pushf  ();
    &pop    ("eax");
    &mov    ("ecx","eax");
    &xor    ("eax",1<<21);
    &push   ("eax");
    &popf   ();
    &pushf  ();
    &pop    ("eax");
    &xor    ("ecx","eax");
    &xor    ("eax","eax");
    &bt     ("ecx",21);
    &jnc    (&label("nocpuid"));
    &mov    ("esi",&wparam(0));
    &mov    (&DWP(8,"esi"),"eax");      # clear 3rd word
    &cpuid  ();
    &mov    ("edi","eax");              # max value for standard query level

    # ebp accumulates vendor-string mismatches; it stays 0 only for
    # "GenuineIntel".
    &xor    ("eax","eax");
    &cmp    ("ebx",0x756e6547);         # "Genu"
    &setne  (&LB("eax"));
    &mov    ("ebp","eax");
    &cmp    ("edx",0x49656e69);         # "ineI"
    &setne  (&LB("eax"));
    &or     ("ebp","eax");
    &cmp    ("ecx",0x6c65746e);         # "ntel"
    &setne  (&LB("eax"));
    &or     ("ebp","eax");              # 0 indicates Intel CPU
    &jz     (&label("intel"));

    # esi accumulates mismatches against "AuthenticAMD" in the same way.
    &cmp    ("ebx",0x68747541);         # "Auth"
    &setne  (&LB("eax"));
    &mov    ("esi","eax");
    &cmp    ("edx",0x69746E65);         # "enti"
    &setne  (&LB("eax"));
    &or     ("esi","eax");
    &cmp    ("ecx",0x444D4163);         # "cAMD"
    &setne  (&LB("eax"));
    &or     ("esi","eax");              # 0 indicates AMD CPU
    &jnz    (&label("intel"));          # neither vendor: use the common path

    # AMD specific
    &mov    ("eax",0x80000000);
    &cpuid  ();
    &cmp    ("eax",0x80000001);
    &jb     (&label("intel"));
    &mov    ("esi","eax");              # max value for extended query level
    &mov    ("eax",0x80000001);
    &cpuid  ();
    &or     ("ebp","ecx");
    &and    ("ebp",1<<11|1);            # isolate XOP bit (bit 0 keeps the
                                        # "not Intel" marker set above)
    &cmp    ("esi",0x80000008);
    &jb     (&label("intel"));

    &mov    ("eax",0x80000008);
    &cpuid  ();
    &movz   ("esi",&LB("ecx"));         # number of cores - 1
    &inc    ("esi");                    # number of cores

    &mov    ("eax",1);
    &xor    ("ecx","ecx");
    &cpuid  ();
    &bt     ("edx",28);
    &jnc    (&label("generic"));
    &shr    ("ebx",16);
    &and    ("ebx",0xff);
    &cmp    ("ebx","esi");
    &ja     (&label("generic"));
    &and    ("edx",0xefffffff);         # clear hyper-threading bit
    &jmp    (&label("generic"));

&set_label("intel");
    &cmp    ("edi",7);
    &jb     (&label("cacheinfo"));

    # Standard level 7 is available: store its EBX as the 3rd word.
    &mov    ("esi",&wparam(0));
    &mov    ("eax",7);
    &xor    ("ecx","ecx");
    &cpuid  ();
    &mov    (&DWP(8,"esi"),"ebx");

&set_label("cacheinfo");
    &cmp    ("edi",4);
    &mov    ("edi",-1);                 # mov leaves the flags of the cmp intact
    &jb     (&label("nocacheinfo"));

    &mov    ("eax",4);
    &mov    ("ecx",0);                  # query L1D
    &cpuid  ();
    &mov    ("edi","eax");
    &shr    ("edi",14);
    &and    ("edi",0xfff);              # number of cores -1 per L1D

&set_label("nocacheinfo");
    &mov    ("eax",1);
    &xor    ("ecx","ecx");
    &cpuid  ();
    &and    ("edx",0xbfefffff);         # force reserved bits #20, #30 to 0
    &cmp    ("ebp",0);
    &jne    (&label("notintel"));
    &or     ("edx",1<<30);              # set reserved bit#30 on Intel CPUs
    &and    (&HB("eax"),15);            # family ID
    &cmp    (&HB("eax"),15);            # P4?
    &jne    (&label("notintel"));
    &or     ("edx",1<<20);              # set reserved bit#20 to engage RC4_CHAR
&set_label("notintel");
    &bt     ("edx",28);                 # test hyper-threading bit
    &jnc    (&label("generic"));
    &and    ("edx",0xefffffff);
    &cmp    ("edi",0);
    &je     (&label("generic"));

    &or     ("edx",0x10000000);
    &shr    ("ebx",16);
    &cmp    (&LB("ebx"),1);
    &ja     (&label("generic"));
    &and    ("edx",0xefffffff);         # clear hyper-threading bit if
                                        # EBX[23:16] is not greater than 1

&set_label("generic");
    &and    ("ebp",1<<11);              # isolate AMD XOP flag
    &and    ("ecx",0xfffff7ff);         # force 11th bit to 0
    &mov    ("esi","edx");
    &or     ("ebp","ecx");              # merge AMD XOP flag

    &bt     ("ecx",27);                 # check OSXSAVE bit
    &jnc    (&label("clear_avx"));
    &xor    ("ecx","ecx");
    &data_byte(0x0f,0x01,0xd0);         # xgetbv
    &and    ("eax",6);
    &cmp    ("eax",6);                  # both XMM and YMM state enabled?
    &je     (&label("done"));
    &cmp    ("eax",2);                  # only XMM state enabled?
    &je     (&label("clear_avx"));
&set_label("clear_xmm");
    &and    ("ebp",0xfdfffffd);         # clear AESNI and PCLMULQDQ bits
    &and    ("esi",0xfeffffff);         # clear FXSR
&set_label("clear_avx");
    &and    ("ebp",0xefffe7ff);         # clear AVX, FMA and AMD XOP bits
    &mov    ("edi",&wparam(0));
    &and    (&DWP(8,"edi"),0xffffffdf); # clear AVX2
&set_label("done");
    &mov    ("eax","esi");
    &mov    ("edx","ebp");
&set_label("nocpuid");
&function_end("OPENSSL_ia32_cpuid");

&external_label("OPENSSL_ia32cap_P");

# OPENSSL_rdtsc()
#
# Returns the time-stamp counter in edx:eax, or 0 when bit 4 of the first
# OPENSSL_ia32cap_P word is clear (no TSC).
&function_begin_B("OPENSSL_rdtsc","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
    &xor    ("eax","eax");
    &xor    ("edx","edx");
    &picmeup("ecx","OPENSSL_ia32cap_P");
    &bt     (&DWP(0,"ecx"),4);
    &jnc    (&label("notsc"));
    &rdtsc  ();
&set_label("notsc");
    &ret    ();
&function_end_B("OPENSSL_rdtsc");

# This works in Ring 0 only [read DJGPP+MS-DOS+privileged DPMI host],
# but it's safe to call it on any [supported] 32-bit platform...
# Just check for [non-]zero return value...
#
# Returns in edx:eax the number of TSC ticks elapsed across a single HLT,
# or 0 when there is no TSC, the caller is not privileged enough, or
# interrupts are disabled.
&function_begin_B("OPENSSL_instrument_halt","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
    &picmeup("ecx","OPENSSL_ia32cap_P");
    &bt     (&DWP(0,"ecx"),4);
    &jnc    (&label("nohalt"));         # no TSC

    &data_word(0x9058900e);             # push %cs; pop %eax
    &and    ("eax",3);                  # low two selector bits = privilege level
    &jnz    (&label("nohalt"));         # not enough privileges

    &pushf  ();
    &pop    ("eax");
    &bt     ("eax",9);
    &jnc    (&label("nohalt"));         # interrupts are disabled

    &rdtsc  ();
    &push   ("edx");                    # save the starting timestamp
    &push   ("eax");
    &halt   ();
    &rdtsc  ();

    &sub    ("eax",&DWP(0,"esp"));      # 64-bit difference end - start
    &sbb    ("edx",&DWP(4,"esp"));
    &add    ("esp",8);
    &ret    ();

&set_label("nohalt");
    &xor    ("eax","eax");
    &xor    ("edx","edx");
    &ret    ();
&function_end_B("OPENSSL_instrument_halt");

# Essentially there is only one use for this function. Under DJGPP:
#
#	#include <go32.h>
#	...
#	i=OPENSSL_far_spin(_dos_ds,0x46c);
#	...
# to obtain the number of spins till closest timer interrupt.

# OPENSSL_far_spin(selector, offset)
#
# Loads the first argument into %ds and spins, counting iterations in eax,
# until the dword at the second argument's address changes. Returns the
# iteration count, or 0 (eax=edx=0) when interrupts are disabled.
&function_begin_B("OPENSSL_far_spin");
    &pushf  ();
    &pop    ("eax");
    &bt     ("eax",9);
    &jnc    (&label("nospin"));         # interrupts are disabled

    &mov    ("eax",&DWP(4,"esp"));
    &mov    ("ecx",&DWP(8,"esp"));
    &data_word (0x90d88e1e);            # push %ds, mov %eax,%ds
    &xor    ("eax","eax");
    &mov    ("edx",&DWP(0,"ecx"));      # snapshot of the watched dword
    &jmp    (&label("spin"));

    &align  (16);
&set_label("spin");
    &inc    ("eax");
    &cmp    ("edx",&DWP(0,"ecx"));
    &je     (&label("spin"));

    &data_word (0x1f909090);            # pop %ds
    &ret    ();

&set_label("nospin");
    &xor    ("eax","eax");
    &xor    ("edx","edx");
    &ret    ();
&function_end_B("OPENSSL_far_spin");

# OPENSSL_wipe_cpu()
#
# Zeroes eax/edx, the XMM registers (only when built with
# -DOPENSSL_IA32_SSE2 and the capability word reports SSE2+FXSR) and the
# x87/MMX register bank, then returns the address of the caller's stack
# frame (4(%esp)) in eax.
&function_begin_B("OPENSSL_wipe_cpu","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
    &xor    ("eax","eax");
    &xor    ("edx","edx");
    &picmeup("ecx","OPENSSL_ia32cap_P");
    &mov    ("ecx",&DWP(0,"ecx"));      # ecx = first capability word
    # Test the capability word itself. ecx no longer holds a pointer here,
    # so it must not be dereferenced.
    # NOTE(review): bit 1 is kept from the original; the x87 FPU flag in
    # CPUID.1:EDX is bit 0 - confirm which bit is intended.
    &bt     ("ecx",1);
    &jnc    (&label("no_x87"));
    if ($sse2) {
        &and    ("ecx",1<<26|1<<24);    # check SSE2 and FXSR bits
        &cmp    ("ecx",1<<26|1<<24);
        &jne    (&label("no_sse2"));
        &pxor   ("xmm0","xmm0");
        &pxor   ("xmm1","xmm1");
        &pxor   ("xmm2","xmm2");
        &pxor   ("xmm3","xmm3");
        &pxor   ("xmm4","xmm4");
        &pxor   ("xmm5","xmm5");
        &pxor   ("xmm6","xmm6");
        &pxor   ("xmm7","xmm7");
    &set_label("no_sse2");
    }
    # just a bunch of fldz to zap the fp/mm bank followed by finit...
    &data_word(0xeed9eed9,0xeed9eed9,0xeed9eed9,0xeed9eed9,0x90e3db9b);
&set_label("no_x87");
    &lea    ("eax",&DWP(4,"esp"));
    &ret    ();
&function_end_B("OPENSSL_wipe_cpu");

# OPENSSL_atomic_add(ptr, amount)
#
# Atomically adds the second argument to the dword at the first argument
# using a lock cmpxchg retry loop and returns the new value in eax.
&function_begin_B("OPENSSL_atomic_add");
    &mov    ("edx",&DWP(4,"esp"));      # fetch the pointer, 1st arg
    &mov    ("ecx",&DWP(8,"esp"));      # fetch the increment, 2nd arg
    &push   ("ebx");
    &nop    ();
    &mov    ("eax",&DWP(0,"edx"));
&set_label("spin");
    &lea    ("ebx",&DWP(0,"eax","ecx"));
    &nop    ();
    &data_word(0x1ab10ff0);             # lock; cmpxchg %ebx,(%edx)
                                        # %eax is involved and is always reloaded
    &jne    (&label("spin"));
    &mov    ("eax","ebx");              # OpenSSL expects the new value
    &pop    ("ebx");
    &ret    ();
&function_end_B("OPENSSL_atomic_add");

# This function can become handy under Win32 in situations when
# we don't know which calling convention, __stdcall or __cdecl(*),
# indirect callee is using. In C it can be deployed as
#
#ifdef OPENSSL_CPUID_OBJ
#	type OPENSSL_indirect_call(void *f,...);
#	...
#	OPENSSL_indirect_call(func,[up to $max arguments]);
#endif
#
# (*)	it's designed to work even for __fastcall if number of
#	arguments is 1 or 2!
&function_begin_B("OPENSSL_indirect_call");
    {
    my ($max,$i)=(7,);                  # $max has to be chosen as 4*n-1
                                        # in order to preserve eventual
                                        # stack alignment
    &push   ("ebp");
    &mov    ("ebp","esp");
    &sub    ("esp",$max*4);
    # The first two arguments are also left in ecx/edx (see the __fastcall
    # remark above).
    &mov    ("ecx",&DWP(12,"ebp"));
    &mov    (&DWP(0,"esp"),"ecx");
    &mov    ("edx",&DWP(16,"ebp"));
    &mov    (&DWP(4,"esp"),"edx");
    for($i=2;$i<$max;$i++)
        {
        # Some copies will be redundant/bogus...
        &mov    ("eax",&DWP(12+$i*4,"ebp"));
        &mov    (&DWP(0+$i*4,"esp"),"eax");
        }
    &call_ptr   (&DWP(8,"ebp"));        # make the call...
    &mov    ("esp","ebp");              # ... and just restore the stack pointer
                                        # without paying attention to what we called,
                                        # (__cdecl *func) or (__stdcall *one).
    &pop    ("ebp");
    &ret    ();
    }
&function_end_B("OPENSSL_indirect_call");

# OPENSSL_ia32_rdrand()
#
# Executes RDRAND up to 8 times until it reports success (carry set) and
# returns the value in eax. When eax is 0 at the end, the remaining loop
# counter (ecx) is returned instead, so 0 is returned only when every
# attempt failed with eax left at 0.
&function_begin_B("OPENSSL_ia32_rdrand");
    &mov    ("ecx",8);
&set_label("loop");
    &rdrand ("eax");
    &jc     (&label("break"));
    &loop   (&label("loop"));
&set_label("break");
    &cmp    ("eax",0);
    &cmove  ("eax","ecx");
    &ret    ();
&function_end_B("OPENSSL_ia32_rdrand");

&hidden("OPENSSL_ia32cap_P");

&asm_finish();