/*
 * SPARC V8/V9 CPU helper routines, assembled through the C preprocessor.
 *
 * Syntax:  Sun/GNU SPARC assembler; "!" starts a comment.
 * ABI:     32-bit SPARC by default, 64-bit SPARC V9 when ABI64 is defined.
 *
 * V9-only instructions are emitted as raw .word values, with the mnemonic
 * in the trailing comment, so that the file assembles with V8 tool chains.
 *
 * Note for maintainers: text inside "!" comments is still seen by the C
 * preprocessor, so keep apostrophes and unbalanced quotes out of them.
 */
#if defined(__SUNPRO_C) && defined(__sparcv9)
# define ABI64	/* They've said -xarch=v9 at command line */
#elif defined(__GNUC__) && defined(__arch64__)
# define ABI64	/* They've said -m64 at command line */
#endif

/*
 * FRAME is the displacement passed to "save" when a routine opens a new
 * register window; BIAS is the constant added to %sp/%fp whenever a real
 * stack address is formed from them.
 */
#ifdef ABI64
  .register	%g2,#scratch
  .register	%g3,#scratch
# define FRAME	-192
# define BIAS	2047
#else
# define FRAME	-96
# define BIAS	0
#endif

.text
.align	32
.global	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,#function
! Keep in mind that this does not excuse us from wiping the stack!
! This routine wipes registers, but not the backing store [which
! resides on the stack, toward lower addresses]. To facilitate for
! stack wiping I return pointer to the top of stack of the *caller*.
!
! In:    nothing
! Out:   %o0 = %fp+BIAS as seen inside this routine, i.e. the stack
!        pointer of the caller with the bias applied
! Clobb: %g1-%g5, %f0-%f31 (and %f32-%f62 on V9), condition codes
OPENSSL_wipe_cpu:
	save	%sp,FRAME,%sp
	nop
#ifdef __sun
#include <sys/trap.h>
	ta	ST_CLEAN_WINDOWS
#else
	call	.walk.reg.wins
#endif
	nop
	! Position-independent address of .zero: the call leaves its own
	! address in %o7, the delay slot loads the distance from the call
	! instruction to .zero, and .PIC.zero.up adds the two together.
	call	.PIC.zero.up
	mov	.zero-(.-4),%o0
	ld	[%o0],%f0		! %f0 = 0
	ld	[%o0],%f1		! %f1 = 0

	subcc	%g0,1,%o0
	! Following is V9 "rd %ccr,%o0" instruction. However! V8
	! specification says that it ("rd %asr2,%o0" in V8 terms) does
	! not cause illegal_instruction trap. It therefore can be used
	! to determine if the CPU the code is executing on is V8- or
	! V9-compliant, as V9 returns a distinct value of 0x99,
	! "negative" and "borrow" bits set in both %icc and %xcc.
	.word	0x91408000	!rd	%ccr,%o0
	cmp	%o0,0x99
	bne	.v8
	nop
			! Even though we do not use %fp register bank,
			! we wipe it as memcpy might have used it...
	.word	0xbfa00040	!fmovd	%f0,%f62
	.word	0xbba00040	!...
	.word	0xb7a00040
	.word	0xb3a00040
	.word	0xafa00040
	.word	0xaba00040
	.word	0xa7a00040
	.word	0xa3a00040
	.word	0x9fa00040
	.word	0x9ba00040
	.word	0x97a00040
	.word	0x93a00040
	.word	0x8fa00040
	.word	0x8ba00040
	.word	0x87a00040
	.word	0x83a00040	!fmovd	%f0,%f32
	! Common tail: zero %f2-%f31 interleaved with the integer registers.
	! %o6 (%sp), %i6 (%fp) and %i7 (return address) are left alone.
.v8:	fmovs	%f1,%f31
	clr	%o0
	fmovs	%f0,%f30
	clr	%o1
	fmovs	%f1,%f29
	clr	%o2
	fmovs	%f0,%f28
	clr	%o3
	fmovs	%f1,%f27
	clr	%o4
	fmovs	%f0,%f26
	clr	%o5
	fmovs	%f1,%f25
	clr	%o7
	fmovs	%f0,%f24
	clr	%l0
	fmovs	%f1,%f23
	clr	%l1
	fmovs	%f0,%f22
	clr	%l2
	fmovs	%f1,%f21
	clr	%l3
	fmovs	%f0,%f20
	clr	%l4
	fmovs	%f1,%f19
	clr	%l5
	fmovs	%f0,%f18
	clr	%l6
	fmovs	%f1,%f17
	clr	%l7
	fmovs	%f0,%f16
	clr	%i0
	fmovs	%f1,%f15
	clr	%i1
	fmovs	%f0,%f14
	clr	%i2
	fmovs	%f1,%f13
	clr	%i3
	fmovs	%f0,%f12
	clr	%i4
	fmovs	%f1,%f11
	clr	%i5
	fmovs	%f0,%f10
	clr	%g1
	fmovs	%f1,%f9
	clr	%g2
	fmovs	%f0,%f8
	clr	%g3
	fmovs	%f1,%f7
	clr	%g4
	fmovs	%f0,%f6
	clr	%g5
	fmovs	%f1,%f5
	fmovs	%f0,%f4
	fmovs	%f1,%f3
	fmovs	%f0,%f2

	add	%fp,BIAS,%i0	! return pointer to top of stack of the caller

	ret
	restore

! Eight zero bytes; source of the zeros loaded into %f0/%f1 above.
.zero:	.long	0x0,0x0
! Leaf helper: %o0 = %o0 + %o7, turning a call-relative offset into an
! absolute address.
.PIC.zero.up:
	retl
	add	%o0,%o7,%o0
#ifdef DEBUG
.global	walk_reg_wins
.type	walk_reg_wins,#function
walk_reg_wins:
#endif
! Recursive helper used when the ST_CLEAN_WINDOWS trap is not available.
! Each invocation opens one more register window with "save", recurses
! unless one of the two %o7 tests below stops it, and on the way back
! clears the out and local registers of the window it opened.
! Out: %o0 of the caller = value returned by the nested call plus one
!      (a depth counter, used for debugging only).
.walk.reg.wins:
	save	%sp,FRAME,%sp
	cmp	%i7,%o7
	be	2f
	clr	%o0		! delay slot: executed on both paths
	cmp	%o7,0	! compiler never cleans %o7...
	be	1f	! could have been a leaf function...
	clr	%o1		! delay slot: executed on both paths
	call	.walk.reg.wins
	nop
1:	clr	%o2
	clr	%o3
	clr	%o4
	clr	%o5
	clr	%o7
	clr	%l0
	clr	%l1
	clr	%l2
	clr	%l3
	clr	%l4
	clr	%l5
	clr	%l6
	clr	%l7
	add	%o0,1,%i0	! used for debugging
2:	ret
	restore
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu

.global	OPENSSL_atomic_add
.type	OPENSSL_atomic_add,#function
.align	32
! Atomically add a 32-bit amount to a 32-bit word in memory.
!
! In:    %o0 = address of the word, %o1 = amount to add
! Out:   %o0 = new value, sign-extended
! Clobb: %o2, %o3, condition codes
!
! On a V9 CPU (always the case when ABI64 is defined) a cas loop is used.
! On a V8 CPU the word itself doubles as a spin-lock: it is swapped with
! the sentinel -1 while the update is in progress, so -1 is not usable
! as a counter value on that path.
OPENSSL_atomic_add:
#ifndef ABI64
	subcc	%g0,1,%o2
	.word	0x95408000	!rd	%ccr,%o2, see comment above
	cmp	%o2,0x99
	be	.v9
	nop
	save	%sp,FRAME,%sp
	ba	.enter
	nop
#ifdef __sun
! Note that you do not have to link with libthread to call thr_yield,
! as libc provides a stub, which is overloaded the moment you link
! with *either* libpthread or libthread...
#define YIELD_CPU	thr_yield
#else
! applies at least to Linux and FreeBSD... Feedback expected...
#define YIELD_CPU	sched_yield
#endif
.spin:	call	YIELD_CPU
	nop
.enter:	ld	[%i0],%i2
	! Cheap pre-check before the bus-locking swap. It has to test the
	! very sentinel that swap stores and that is tested again below
	! (-1); testing any other constant would never notice a held lock
	! and would spin forever on a counter that holds that constant.
	cmp	%i2,-1
	be	.spin
	mov	-1,%i2		! delay slot: sentinel to swap in
	swap	[%i0],%i2	! take the lock, fetch the old value
	cmp	%i2,-1
	be	.spin		! somebody else got there first
	add	%i2,%i1,%i2	! delay slot: new value = old + amount
	stbar			! order earlier stores before the release
	st	%i2,[%i0]	! publish new value, which releases the lock
	sra	%i2,%g0,%i0
	ret
	restore
.v9:
#endif
	ld	[%o0],%o2
1:	add	%o1,%o2,%o3
	.word	0xd7e2100a	!cas [%o0],%o2,%o3, compare [%o0] with %o2 and swap %o3
	cmp	%o2,%o3
	bne	1b
	mov	%o3,%o2		! cas is always fetching to dest. register
	add	%o1,%o2,%o0	! OpenSSL expects the new value
	retl
	sra	%o0,%g0,%o0	! we return signed int, remember?
.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add

.global	_sparcv9_rdtick
.align	32
! Read the %tick counter.
!
! Out:   on V9: %o0 = %tick, %o1 = %tick shifted right by 32 bits
!        on V8: %o0 = 0, %o1 = 0
! Clobb: condition codes
_sparcv9_rdtick:
	subcc	%g0,1,%o0
	.word	0x91408000	!rd	%ccr,%o0
	cmp	%o0,0x99
	bne	.notick
	xor	%o0,%o0,%o0	! delay slot: %o0 = 0 on both paths
	.word	0x91410000	!rd	%tick,%o0
	retl
	.word	0x93323020	!srlx	%o0,32,%o1
.notick:
	retl
	xor	%o1,%o1,%o1
.type	_sparcv9_rdtick,#function
.size	_sparcv9_rdtick,.-_sparcv9_rdtick

.global	_sparcv9_vis1_probe
.align	8
! Execute two VIS1 instructions (a partial-store-ASI load from the stack
! and fxor). No value is computed; presumably the caller detects missing
! VIS1 support through the resulting trap -- confirm against the caller.
! Clobb: %o1, %f0
_sparcv9_vis1_probe:
	add	%sp,BIAS+2,%o1
	.word	0xc19a5a40	!ldda	[%o1]ASI_FP16_P,%f0
	retl
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
.type	_sparcv9_vis1_probe,#function
.size	_sparcv9_vis1_probe,.-_sparcv9_vis1_probe

! Probe and instrument VIS1 instruction. Output is number of cycles it
! takes to execute rdtick and pair of VIS1 instructions. US-Tx VIS unit
! is slow (documented to be 6 cycles on T2) and the core is in-order
! single-issue, it should be possible to distinguish Tx reliably...
! Observed return values are:
!
!	UltraSPARC IIe		7
!	UltraSPARC III		7
!	UltraSPARC T1		24
!
! Numbers for T2 and SPARC64 V-VII are more than welcomed.
!
! It would be possible to detect specifically US-T1 by instrumenting
! fmul8ulx16, which is emulated on T1 and as such accounts for quite
! a lot of %tick-s, couple of thousand on Linux...
!
! Out:   %o0 = smallest of the four measured intervals
! Clobb: %o1-%o4, %f0, %f2, condition codes
.global	_sparcv9_vis1_instrument
.align	8
_sparcv9_vis1_instrument:
	.word	0x91410000	!rd	%tick,%o0
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x93410000	!rd	%tick,%o1
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x95410000	!rd	%tick,%o2
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x97410000	!rd	%tick,%o3
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x99410000	!rd	%tick,%o4

	! calculate intervals
	sub	%o1,%o0,%o0
	sub	%o2,%o1,%o1
	sub	%o3,%o2,%o2
	sub	%o4,%o3,%o3

	! find minimum value; each annulled branch executes the mov in
	! its delay slot only when %o0 is the larger (unsigned) operand
	cmp	%o0,%o1
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o1,%o0
	cmp	%o0,%o2
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o2,%o0
	cmp	%o0,%o3
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o3,%o0

	retl
	nop
.type	_sparcv9_vis1_instrument,#function
.size	_sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument

.global	_sparcv9_vis2_probe
.align	8
! Execute one VIS2 instruction (bshuffle) and return.
! Clobb: %f0
_sparcv9_vis2_probe:
	retl
	.word	0x81b00980	!bshuffle	%f0,%f0,%f0
.type	_sparcv9_vis2_probe,#function
.size	_sparcv9_vis2_probe,.-_sparcv9_vis2_probe

.global	_sparcv9_fmadd_probe
.align	8
! Zero %f0 and %f2, then execute one fused multiply-add and return.
! Clobb: %f0, %f2
_sparcv9_fmadd_probe:
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	retl
	.word	0x81b80440	!fmaddd	%f0,%f0,%f2,%f0
.type	_sparcv9_fmadd_probe,#function
.size	_sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe

.global	OPENSSL_cleanse
.align	32
! Overwrite a memory region with zero bytes.
!
! In:    %o0 = start address, %o1 = length in bytes
! Out:   nothing
! Clobb: %o0, %o1, %g1 (32-bit build only), condition codes
!
! Lengths up to 14 are cleared byte by byte. Longer regions are first
! byte-cleared up to an 8-byte (V9) or 4-byte (V8) boundary, then cleared
! with aligned doubleword/word stores, and the remaining tail is handed
! back to the byte loop.
OPENSSL_cleanse:
	cmp	%o1,14
	nop
#ifdef ABI64
	bgu	%xcc,.Lot
#else
	bgu	.Lot
#endif
	cmp	%o1,0		! delay slot: flags for the short path
	bne	.Little
	nop
	retl
	nop

.Little:
	stb	%g0,[%o0]
	subcc	%o1,1,%o1
	bnz	.Little
	add	%o0,1,%o0
	retl
	nop
.align	32
.Lot:
#ifndef ABI64
	subcc	%g0,1,%g1
	! see above for explanation
	.word	0x83408000	!rd	%ccr,%g1
	cmp	%g1,0x99
	bne	.v8lot
	nop
#endif

.v9lot:	andcc	%o0,7,%g0
	bz	.v9aligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.v9lot
	add	%o0,1,%o0
.align	16,0x01000000
.v9aligned:
	.word	0xc0720000	!stx	%g0,[%o0]
	sub	%o1,8,%o1
	andcc	%o1,-8,%g0	! at least 8 bytes still to go?
#ifdef ABI64
	.word	0x126ffffd	!bnz	%xcc,.v9aligned
#else
	.word	0x124ffffd	!bnz	%icc,.v9aligned
#endif
	add	%o0,8,%o0

	cmp	%o1,0
	bne	.Little
	nop
	retl
	nop
#ifndef ABI64
.v8lot:	andcc	%o0,3,%g0
	bz	.v8aligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.v8lot
	add	%o0,1,%o0
	nop
.v8aligned:
	st	%g0,[%o0]
	sub	%o1,4,%o1
	andcc	%o1,-4,%g0	! at least 4 bytes still to go?
	bnz	.v8aligned
	add	%o0,4,%o0

	cmp	%o1,0
	bne	.Little
	nop
	retl
	nop
#endif
.type	OPENSSL_cleanse,#function
.size	OPENSSL_cleanse,.-OPENSSL_cleanse

! Run OPENSSL_cpuid_setup from the .init section.
.section	".init",#alloc,#execinstr
	call	OPENSSL_cpuid_setup
	nop