#!/usr/bin/env perl

# Locate the perlasm helper directory relative to this script and load
# the x86 assembler front-end from it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC, "${dir}perlasm", "perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"x86cpuid");

# Remember whether -DOPENSSL_IA32_SSE2 was passed on the command line.
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# OPENSSL_ia32_cpuid
#
# Returns an edited copy of the CPUID leaf 1 feature words: eax carries
# the (edited) edx word and edx carries the (edited) ecx word.  Both are
# zero when EFLAGS bit 21 cannot be toggled, i.e. when the CPUID
# instruction is not available.
&function_begin("OPENSSL_ia32_cpuid");
    &xor    ("edx","edx");
    # Try to flip EFLAGS bit 21; ecx ends up holding the bits that
    # actually changed.
    &pushf  ();
    &pop    ("eax");
    &mov    ("ecx","eax");
    &xor    ("eax",1<<21);
    &push   ("eax");
    &popf   ();
    &pushf  ();
    &pop    ("eax");
    &xor    ("ecx","eax");
    &xor    ("eax","eax");
    &bt     ("ecx",21);
    &jnc    (&label("nocpuid"));
    &cpuid  ();
    &mov    ("edi","eax");          # max value for standard query level

    # Compare the vendor string against "GenuineIntel"; ebp accumulates
    # the mismatches.
    &xor    ("eax","eax");
    &cmp    ("ebx",0x756e6547);     # "Genu"
    &setne  (&LB("eax"));
    &mov    ("ebp","eax");
    &cmp    ("edx",0x49656e69);     # "ineI"
    &setne  (&LB("eax"));
    &or     ("ebp","eax");
    &cmp    ("ecx",0x6c65746e);     # "ntel"
    &setne  (&LB("eax"));
    &or     ("ebp","eax");          # 0 indicates Intel CPU
    &jz     (&label("intel"));

    # Compare the vendor string against "AuthenticAMD"; esi accumulates
    # the mismatches.
    &cmp    ("ebx",0x68747541);     # "Auth"
    &setne  (&LB("eax"));
    &mov    ("esi","eax");
    &cmp    ("edx",0x69746E65);     # "enti"
    &setne  (&LB("eax"));
    &or     ("esi","eax");
    &cmp    ("ecx",0x444D4163);     # "cAMD"
    &setne  (&LB("eax"));
    &or     ("esi","eax");          # 0 indicates AMD CPU
    &jnz    (&label("intel"));

    # AMD specific
    &mov    ("eax",0x80000000);
    &cpuid  ();
    &cmp    ("eax",0x80000001);
    &jb     (&label("intel"));
    &mov    ("esi","eax");          # keep max extended query level
    &mov    ("eax",0x80000001);
    &cpuid  ();
    &or     ("ebp","ecx");
    &and    ("ebp",1<<11|1);        # isolate XOP bit
    &cmp    ("esi",0x80000008);
    &jb     (&label("intel"));

    &mov    ("eax",0x80000008);
    &cpuid  ();
    &movz   ("esi",&LB("ecx"));     # number of cores - 1
    &inc    ("esi");                # number of cores

    &mov    ("eax",1);
    &cpuid  ();
    &bt     ("edx",28);
    &jnc    (&label("generic"));
    &shr    ("ebx",16);
    &and    ("ebx",0xff);
    &cmp    ("ebx","esi");
    &ja     (&label("generic"));
    &and    ("edx",0xefffffff);     # clear hyper-threading bit
    &jmp    (&label("generic"));

&set_label("intel");
    &cmp    ("edi",4);
    &mov    ("edi",-1);             # mov leaves the flags from cmp intact
    &jb     (&label("nocacheinfo"));

    &mov    ("eax",4);
    &mov    ("ecx",0);              # query L1D
    &cpuid  ();
    &mov    ("edi","eax");
    &shr    ("edi",14);
    &and    ("edi",0xfff);          # number of cores -1 per L1D

&set_label("nocacheinfo");
    &mov    ("eax",1);
    &cpuid  ();
    &and    ("edx",0xbfefffff);     # force reserved bits #20, #30 to 0
    &cmp    ("ebp",0);
    &jne    (&label("notintel"));
    &or     ("edx",1<<30);          # set reserved bit#30 on Intel CPUs
    &and    (&HB("eax"),15);        # family ID
    &cmp    (&HB("eax"),15);        # P4?
    &jne    (&label("notintel"));
    &or     ("edx",1<<20);          # set reserved bit#20 to engage RC4_CHAR
&set_label("notintel");
    &bt     ("edx",28);             # test hyper-threading bit
    &jnc    (&label("generic"));
    &and    ("edx",0xefffffff);
    &cmp    ("edi",0);
    &je     (&label("generic"));

    &or     ("edx",0x10000000);
    &shr    ("ebx",16);
    &cmp    (&LB("ebx"),1);
    &ja     (&label("generic"));
    &and    ("edx",0xefffffff);     # clear hyper-threading bit if not

&set_label("generic");
    &and    ("ebp",1<<11);          # isolate AMD XOP flag
    &and    ("ecx",0xfffff7ff);     # force 11th bit to 0
    &mov    ("esi","edx");
    &or     ("ebp","ecx");          # merge AMD XOP flag

    &bt     ("ecx",27);             # check OSXSAVE bit
    &jnc    (&label("clear_avx"));
    &xor    ("ecx","ecx");
    &data_byte(0x0f,0x01,0xd0);     # xgetbv
    &and    ("eax",6);
    &cmp    ("eax",6);
    &je     (&label("done"));
    &cmp    ("eax",2);
    &je     (&label("clear_avx"));
&set_label("clear_xmm");
    &and    ("ebp",0xfdfffffd);     # clear AESNI and PCLMULQDQ bits
    &and    ("esi",0xfeffffff);     # clear FXSR
&set_label("clear_avx");
    &and    ("ebp",0xefffe7ff);     # clear AVX, FMA and AMD XOP bits
&set_label("done");
    &mov    ("eax","esi");
    &mov    ("edx","ebp");
&set_label("nocpuid");
&function_end("OPENSSL_ia32_cpuid");

&external_label("OPENSSL_ia32cap_P");

# OPENSSL_rdtsc
#
# Returns the rdtsc result in edx:eax, or zero in both registers when
# bit 4 of the first OPENSSL_ia32cap_P word is clear.
&function_begin_B("OPENSSL_rdtsc","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
    &xor    ("eax","eax");
    &xor    ("edx","edx");
    &picmeup("ecx","OPENSSL_ia32cap_P");
    &bt     (&DWP(0,"ecx"),4);
    &jnc    (&label("notsc"));
    &rdtsc  ();
&set_label("notsc");
    &ret    ();
&function_end_B("OPENSSL_rdtsc");

# This works in Ring 0 only [read DJGPP+MS-DOS+privileged DPMI host],
# but it's safe to call it on any [supported] 32-bit platform...
# Just check for [non-]zero return value...
#
# The value returned in edx:eax is the difference between the rdtsc
# readings taken after and before the halt instruction; zero is
# returned on any of the "nohalt" paths.
&function_begin_B("OPENSSL_instrument_halt","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
    &picmeup("ecx","OPENSSL_ia32cap_P");
    &bt     (&DWP(0,"ecx"),4);
    &jnc    (&label("nohalt"));     # no TSC

    &data_word(0x9058900e);         # push %cs; pop %eax
    &and    ("eax",3);
    &jnz    (&label("nohalt"));     # not enough privileges

    &pushf  ();
    &pop    ("eax");
    &bt     ("eax",9);
    &jnc    (&label("nohalt"));     # interrupts are disabled

    &rdtsc  ();
    &push   ("edx");                # save the starting timestamp on the stack
    &push   ("eax");
    &halt   ();
    &rdtsc  ();

    &sub    ("eax",&DWP(0,"esp"));  # 64-bit subtract: end - start
    &sbb    ("edx",&DWP(4,"esp"));
    &add    ("esp",8);
    &ret    ();

&set_label("nohalt");
    &xor    ("eax","eax");
    &xor    ("edx","edx");
    &ret    ();
&function_end_B("OPENSSL_instrument_halt");

# Essentially there is only one use for this function. Under DJGPP:
#
#	#include <go32.h>
#	...
#	i=OPENSSL_far_spin(_dos_ds,0x46c);
#	...
# to obtain the number of spins till closest timer interrupt.

# OPENSSL_far_spin
#
# Takes a segment selector (first stack argument) and an offset (second
# stack argument), loads the selector into %ds and counts in eax how
# many times the dword at that offset is re-read before it changes.
# Returns zero when EFLAGS bit 9 is clear.
&function_begin_B("OPENSSL_far_spin");
    &pushf  ();
    &pop    ("eax");
    &bt     ("eax",9);
    &jnc    (&label("nospin"));     # interrupts are disabled

    &mov    ("eax",&DWP(4,"esp"));
    &mov    ("ecx",&DWP(8,"esp"));
    &data_word (0x90d88e1e);        # push %ds, mov %eax,%ds
    &xor    ("eax","eax");
    &mov    ("edx",&DWP(0,"ecx"));  # initial value to compare against
    &jmp    (&label("spin"));

    &align  (16);
&set_label("spin");
    &inc    ("eax");
    &cmp    ("edx",&DWP(0,"ecx"));
    &je     (&label("spin"));

    &data_word (0x1f909090);        # pop %ds
    &ret    ();

&set_label("nospin");
    &xor    ("eax","eax");
    &xor    ("edx","edx");
    &ret    ();
&function_end_B("OPENSSL_far_spin");

# OPENSSL_wipe_cpu
#
# Zeroes eax/edx, optionally the xmm0-xmm7 registers (SSE2 builds only)
# and the x87 register stack, then returns the address just above the
# return address (esp+4) in eax.
&function_begin_B("OPENSSL_wipe_cpu","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
    &xor    ("eax","eax");
    &xor    ("edx","edx");
    &picmeup("ecx","OPENSSL_ia32cap_P");
    &mov    ("ecx",&DWP(0,"ecx"));
    # NOTE(review): ecx now holds the dword loaded from OPENSSL_ia32cap_P,
    # yet the bt below uses it as a memory address.  A register operand
    # may have been intended -- confirm before changing.
    &bt     (&DWP(0,"ecx"),1);
    &jnc    (&label("no_x87"));
    if ($sse2) {
        &and    ("ecx",1<<26|1<<24);    # check SSE2 and FXSR bits
        &cmp    ("ecx",1<<26|1<<24);
        &jne    (&label("no_sse2"));
        &pxor   ("xmm0","xmm0");
        &pxor   ("xmm1","xmm1");
        &pxor   ("xmm2","xmm2");
        &pxor   ("xmm3","xmm3");
        &pxor   ("xmm4","xmm4");
        &pxor   ("xmm5","xmm5");
        &pxor   ("xmm6","xmm6");
        &pxor   ("xmm7","xmm7");
    &set_label("no_sse2");
    }
    # just a bunch of fldz to zap the fp/mm bank followed by finit...
    &data_word(0xeed9eed9,0xeed9eed9,0xeed9eed9,0xeed9eed9,0x90e3db9b);
&set_label("no_x87");
    &lea    ("eax",&DWP(4,"esp"));
    &ret    ();
&function_end_B("OPENSSL_wipe_cpu");

# OPENSSL_atomic_add
#
# Atomically adds the second argument to the dword addressed by the
# first argument using a lock cmpxchg retry loop, and returns the new
# value in eax.
&function_begin_B("OPENSSL_atomic_add");
    &mov    ("edx",&DWP(4,"esp"));  # fetch the pointer, 1st arg
    &mov    ("ecx",&DWP(8,"esp"));  # fetch the increment, 2nd arg
    &push   ("ebx");
    &nop    ();
    &mov    ("eax",&DWP(0,"edx"));
&set_label("spin");
    &lea    ("ebx",&DWP(0,"eax","ecx"));
    &nop    ();
    &data_word(0x1ab10ff0);         # lock;	cmpxchg	%ebx,(%edx)	# %eax is involved and is always reloaded
    &jne    (&label("spin"));
    &mov    ("eax","ebx");          # OpenSSL expects the new value
    &pop    ("ebx");
    &ret    ();
&function_end_B("OPENSSL_atomic_add");

# This function can become handy under Win32 in situations when
# we don't know which calling convention, __stdcall or __cdecl(*),
# indirect callee is using. In C it can be deployed as
#
#ifdef OPENSSL_CPUID_OBJ
#	type OPENSSL_indirect_call(void *f,...);
#	...
#	OPENSSL_indirect_call(func,[up to $max arguments]);
#endif
#
# (*)	it's designed to work even for __fastcall if number of
#	arguments is 1 or 2!
&function_begin_B("OPENSSL_indirect_call");
    {
    my $max=7;                      # $max has to be chosen as 4*n-1
                                    # in order to preserve eventual
                                    # stack alignment
    &push   ("ebp");
    &mov    ("ebp","esp");
    &sub    ("esp",$max*4);
    # The first two arguments are also left in ecx and edx.
    &mov    ("ecx",&DWP(12,"ebp"));
    &mov    (&DWP(0,"esp"),"ecx");
    &mov    ("edx",&DWP(16,"ebp"));
    &mov    (&DWP(4,"esp"),"edx");
    for (my $i=2;$i<$max;$i++)
        {
        # Some copies will be redundant/bogus...
        &mov    ("eax",&DWP(12+$i*4,"ebp"));
        &mov    (&DWP(0+$i*4,"esp"),"eax");
        }
    &call_ptr   (&DWP(8,"ebp"));    # make the call...
    &mov    ("esp","ebp");          # ... and just restore the stack pointer
                                    # without paying attention to what we called,
                                    # (__cdecl *func) or (__stdcall *one).
    &pop    ("ebp");
    &ret    ();
    }
&function_end_B("OPENSSL_indirect_call");

# OPENSSL_cleanse
#
# Overwrites the buffer given by the first argument (pointer) and the
# second argument (length in bytes) with zeros.  Lengths below 7 are
# cleared byte by byte; longer buffers are first aligned to a 4-byte
# boundary, cleared a dword at a time, and any tail is finished by the
# byte loop.
&function_begin_B("OPENSSL_cleanse");
    &mov    ("edx",&wparam(0));
    &mov    ("ecx",&wparam(1));
    &xor    ("eax","eax");
    &cmp    ("ecx",7);
    &jae    (&label("lot"));
    &cmp    ("ecx",0);
    &je     (&label("ret"));
&set_label("little");
    &mov    (&BP(0,"edx"),"al");
    &sub    ("ecx",1);
    &lea    ("edx",&DWP(1,"edx"));  # lea keeps the flags set by sub
    &jnz    (&label("little"));
&set_label("ret");
    &ret    ();

&set_label("lot",16);
    &test   ("edx",3);
    &jz     (&label("aligned"));
    &mov    (&BP(0,"edx"),"al");
    &lea    ("ecx",&DWP(-1,"ecx"));
    &lea    ("edx",&DWP(1,"edx"));
    &jmp    (&label("lot"));
&set_label("aligned");
    &mov    (&DWP(0,"edx"),"eax");
    &lea    ("ecx",&DWP(-4,"ecx"));
    &test   ("ecx",-4);
    &lea    ("edx",&DWP(4,"edx"));  # lea keeps the flags set by test
    &jnz    (&label("aligned"));
    &cmp    ("ecx",0);
    &jne    (&label("little"));
    &ret    ();
&function_end_B("OPENSSL_cleanse");

# OPENSSL_ia32_rdrand
#
# Issues rdrand up to 8 times, stopping at the first attempt that sets
# the carry flag.  If eax is zero afterwards it is replaced by the
# remaining loop counter in ecx.
&function_begin_B("OPENSSL_ia32_rdrand");
    &mov    ("ecx",8);
&set_label("loop");
    &rdrand ("eax");
    &jc     (&label("break"));
    &loop   (&label("loop"));
&set_label("break");
    &cmp    ("eax",0);
    &cmove  ("eax","ecx");
    &ret    ();
&function_end_B("OPENSSL_ia32_rdrand");

&initseg("OPENSSL_cpuid_setup");

&asm_finish();