/*
 * Pick the 64-bit ABI when the compiler driver asks for it:
 * Sun Studio run with -xarch=v9 defines __sparcv9, GCC run with -m64
 * defines __arch64__.
 */
#if (defined(__SUNPRO_C) && defined(__sparcv9)) || (defined(__GNUC__) && defined(__arch64__))
# define ABI64
#endif

/*
 * FRAME is the stack adjustment handed to every "save" in this file.
 * BIAS is what this file adds to %sp/%fp before using them as addresses.
 * The 64-bit build additionally declares %g2 and %g3 as scratch registers.
 */
#if defined(ABI64)
# define	FRAME	-192
# define	BIAS	2047
  .register	%g2,#scratch
  .register	%g3,#scratch
#else
# define	FRAME	-96
# define	BIAS	0
#endif
16
.text
.align	32
.global	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,#function
!
! OPENSSL_wipe_cpu: zero the integer and floating-point registers.
!
! In:	nothing
! Out:	%o0 (as seen by the caller) = stack pointer of the *caller*
!	plus the stack bias
!
! Registers are wiped, the register window backing store is not. That
! store resides on the stack, toward lower addresses, so this routine
! does not excuse anybody from wiping the stack. The returned pointer
! to the top of the caller stack is there to make that wipe easier.
!
OPENSSL_wipe_cpu:
	save	%sp,FRAME,%sp
	nop
	! Deal with the other register windows first: a software trap
	! on Solaris, the recursive helper everywhere else. The nop
	! after the conditional block is the delay slot of the call.
#ifdef __sun
#include <sys/trap.h>
	ta	ST_CLEAN_WINDOWS
#else
	call	.walk.reg.wins
#endif
	nop

	! Position-independent address of .zero: the delay slot loads
	! the distance from the call instruction to .zero, and
	! .PIC.zero.up adds the address of the call (left in %o7).
	call	.PIC.zero.up
	mov	.zero-(.-4),%o0
	ld	[%o0],%f0		! %f0 = 0 and %f1 = 0, the sources of
	ld	[%o0],%f1		! every floating-point move below

	subcc	%g0,1,%o0
	! What follows is the V9 instruction "rd %ccr,%o0". The V8
	! specification says that the same encoding ("rd %asr2,%o0" in
	! V8 terms) does not cause an illegal_instruction trap, so it
	! can be used to tell whether the CPU executing this code is
	! V8- or V9-compliant: after the subcc above V9 returns the
	! distinct value 0x99, that is the "negative" and "borrow" bits
	! set in both %icc and %xcc.
	.word	0x91408000	!rd	%ccr,%o0
	cmp	%o0,0x99
	bne	.wipe.common
	nop
			! V9 only: the upper floating-point registers are
			! not used by this code, but memcpy might have
			! used them, so they are wiped as well.
			.word	0xbfa00040	!fmovd	%f0,%f62
			.word	0xbba00040	!fmovd	%f0,%f60
			.word	0xb7a00040	!fmovd	%f0,%f58
			.word	0xb3a00040	!fmovd	%f0,%f56
			.word	0xafa00040	!fmovd	%f0,%f54
			.word	0xaba00040	!fmovd	%f0,%f52
			.word	0xa7a00040	!fmovd	%f0,%f50
			.word	0xa3a00040	!fmovd	%f0,%f48
			.word	0x9fa00040	!fmovd	%f0,%f46
			.word	0x9ba00040	!fmovd	%f0,%f44
			.word	0x97a00040	!fmovd	%f0,%f42
			.word	0x93a00040	!fmovd	%f0,%f40
			.word	0x8fa00040	!fmovd	%f0,%f38
			.word	0x8ba00040	!fmovd	%f0,%f36
			.word	0x87a00040	!fmovd	%f0,%f34
			.word	0x83a00040	!fmovd	%f0,%f32

	! V8 and V9 alike: floating-point moves (right-hand column) are
	! interleaved with integer register clears (left-hand column).
.wipe.common:
			fmovs	%f1,%f31
	clr	%o0
			fmovs	%f0,%f30
	clr	%o1
			fmovs	%f1,%f29
	clr	%o2
			fmovs	%f0,%f28
	clr	%o3
			fmovs	%f1,%f27
	clr	%o4
			fmovs	%f0,%f26
	clr	%o5
			fmovs	%f1,%f25
	clr	%o7
			fmovs	%f0,%f24
	clr	%l0
			fmovs	%f1,%f23
	clr	%l1
			fmovs	%f0,%f22
	clr	%l2
			fmovs	%f1,%f21
	clr	%l3
			fmovs	%f0,%f20
	clr	%l4
			fmovs	%f1,%f19
	clr	%l5
			fmovs	%f0,%f18
	clr	%l6
			fmovs	%f1,%f17
	clr	%l7
			fmovs	%f0,%f16
	clr	%i0
			fmovs	%f1,%f15
	clr	%i1
			fmovs	%f0,%f14
	clr	%i2
			fmovs	%f1,%f13
	clr	%i3
			fmovs	%f0,%f12
	clr	%i4
			fmovs	%f1,%f11
	clr	%i5
			fmovs	%f0,%f10
	clr	%g1
			fmovs	%f1,%f9
	clr	%g2
			fmovs	%f0,%f8
	clr	%g3
			fmovs	%f1,%f7
	clr	%g4
			fmovs	%f0,%f6
	clr	%g5
			fmovs	%f1,%f5
			fmovs	%f0,%f4
			fmovs	%f1,%f3
			fmovs	%f0,%f2

	! Return value: %i0 turns into %o0 of the caller on restore.
	add	%fp,BIAS,%i0

	ret
	restore

! Two zero words, loaded into %f0/%f1 above.
.zero:	.long	0x0,0x0
! Leaf helper: %o0 = %o0 + %o7, i.e. the offset passed in %o0 plus the
! address of the call instruction that got us here.
.PIC.zero.up:
	retl
	add	%o0,%o7,%o0
!
! .walk.reg.wins: recursive helper of OPENSSL_wipe_cpu.
!
! Each invocation opens a fresh register window with "save", recurses
! if the window looks used, and clears the out and local registers of
! its own window on the way back. The value handed back is one more
! than what the nested invocation returned and is meant for debugging
! only. DEBUG builds export the routine as walk_reg_wins.
!
#ifdef DEBUG
.global	walk_reg_wins
.type	walk_reg_wins,#function
walk_reg_wins:
#endif
.walk.reg.wins:
	save	%sp,FRAME,%sp
	cmp	%i7,%o7			! stale %o7 equal to our own return
	be	.walk.done		! address: stop walking right here
	clr	%o0			! [delay slot]
	tst	%o7			! the compiler never cleans %o7...
	be	.walk.scrub		! ...zero could have been a leaf function
	clr	%o1			! [delay slot]
	call	.walk.reg.wins
	nop
.walk.scrub:
	clr	%o2
	clr	%o3
	clr	%o4
	clr	%o5
	clr	%o7
	clr	%l0
	clr	%l1
	clr	%l2
	clr	%l3
	clr	%l4
	clr	%l5
	clr	%l6
	clr	%l7
	add	%o0,1,%i0		! used for debugging
.walk.done:
	ret
	restore
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
167
.global	OPENSSL_atomic_add
.type	OPENSSL_atomic_add,#function
.align	32
!
! OPENSSL_atomic_add: atomically add to a 32-bit counter in memory.
!
! In:	%o0 = address of the counter
!	%o1 = amount to add
! Out:	%o0 = new value of the counter, as a signed int
!
! A V9 CPU (and every 64-bit build) uses a compare-and-swap loop. A
! 32-bit build that finds itself on a V8 CPU falls back to a spin lock
! kept in the counter word itself: the word is taken by swapping in -1
! and released by storing the updated value. Consequently a counter
! that really holds -1 cannot be told from a busy one on that path.
!
OPENSSL_atomic_add:
#ifndef ABI64
	! V8/V9 detection: the word is V9 "rd %ccr,%o2", which V8
	! executes as "rd %asr2,%o2" without trapping. After the subcc
	! only a V9 CPU reports 0x99.
	subcc	%g0,1,%o2
	.word	0x95408000	!rd	%ccr,%o2
	cmp	%o2,0x99
	be	.v9
	nop
	save	%sp,FRAME,%sp
	ba	.enter
	nop
#ifdef __sun
! Note that you do not have to link with libthread to call thr_yield,
! as libc provides a stub, which is overloaded the moment you link
! with *either* libpthread or libthread...
#define	YIELD_CPU	thr_yield
#else
! applies at least to Linux and FreeBSD... Feedback expected...
#define	YIELD_CPU	sched_yield
#endif
.spin:	call	YIELD_CPU
	nop
.enter:	ld	[%i0],%i2
	! Cheap read-only test first. It has to look for the very marker
	! that the swap below plants (-1), otherwise a held lock is never
	! noticed here and an unrelated counter value would spin forever.
	cmp	%i2,-1
	be	.spin
	mov	-1,%i2			! [delay slot] busy marker
	swap	[%i0],%i2		! take the word, fetch its old content
	cmp	%i2,-1			! somebody else got there first?
	be	.spin
	add	%i2,%i1,%i2		! [delay slot] old value + amount
	stbar
	st	%i2,[%i0]		! publish the result, releases the word
	sra	%i2,%g0,%i0		! signed int result
	ret
	restore
.v9:
#endif
	ld	[%o0],%o2
1:	add	%o1,%o2,%o3
	.word	0xd7e2100a	!cas [%o0],%o2,%o3, compare [%o0] with %o2 and swap %o3
	cmp	%o2,%o3			! %o3 now holds what was in memory
	bne	1b			! changed under us, retry
	mov	%o3,%o2		! [delay slot] cas is always fetching to dest. register
	add	%o1,%o2,%o0	! OpenSSL expects the new value
	retl
	sra	%o0,%g0,%o0	! [delay slot] the result is a signed int
.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add
217
.global	_sparcv9_rdtick
.type	_sparcv9_rdtick,#function
.align	32
!
! _sparcv9_rdtick: read the %tick register where there is one.
!
! Out:	V9 CPU:	%o0 = %tick, %o1 = %tick shifted right by 32 bits
!	V8 CPU:	%o0 = 0, %o1 = 0
!
! The CPU generation is detected with "rd %ccr", which V8 executes as
! "rd %asr2" without trapping. After the subcc only V9 reports 0x99.
!
_sparcv9_rdtick:
	subcc	%g0,1,%o0
	.word	0x91408000	!rd	%ccr,%o0
	cmp	%o0,0x99
	bne	.rdtick.v8
	clr	%o0			! [delay slot] executed on both paths
	.word	0x91410000	!rd	%tick,%o0
	retl
	.word	0x93323020	!srlx	%o0,32,%o1
.rdtick.v8:
	retl
	clr	%o1			! [delay slot]
.size	_sparcv9_rdtick,.-_sparcv9_rdtick
234
.global	_sparcv9_vis1_probe
.type	_sparcv9_vis1_probe,#function
.align	8
!
! _sparcv9_vis1_probe: execute two VIS1 instructions, a short
! floating-point load from the stack area (ldda with ASI_FP16_P) and
! an fxor. Leaves %f0 zeroed and an address in %o1.
! NOTE(review): presumably the caller catches the trap raised on CPUs
! without VIS1; that part is not visible in this file.
!
_sparcv9_vis1_probe:
	add	%sp,BIAS+2,%o1
	.word	0xc19a5a40	!ldda	[%o1]ASI_FP16_P,%f0
	retl
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
.size	_sparcv9_vis1_probe,.-_sparcv9_vis1_probe
244
!
! _sparcv9_vis1_instrument: probe and instrument VIS1 instructions.
!
! Out:	%o0 = the smallest of four consecutive measurements of the
!	number of %tick cycles it takes to execute rdtick and a pair
!	of VIS1 instructions
!
! The VIS unit of UltraSPARC Tx is slow (documented to be 6 cycles on
! T2) and the core is in-order single-issue, so it should be possible
! to distinguish Tx reliably. Observed return values are:
!
!	UltraSPARC IIe		7
!	UltraSPARC III		7
!	UltraSPARC T1		24
!
! Numbers for T2 and SPARC64 V-VII are more than welcome.
!
! It would be possible to detect specifically US-T1 by instrumenting
! fmul8ulx16, which is emulated on T1 and as such accounts for quite
! a lot of %tick-s, a couple of thousand on Linux...
!
.global	_sparcv9_vis1_instrument
.type	_sparcv9_vis1_instrument,#function
.align	8
_sparcv9_vis1_instrument:
	! five time stamps, %o0 through %o4, with two fxor in between
	.word	0x91410000	!rd	%tick,%o0
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x93410000	!rd	%tick,%o1
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x95410000	!rd	%tick,%o2
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x97410000	!rd	%tick,%o3
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x99410000	!rd	%tick,%o4

	! turn the time stamps into four intervals, %o0 through %o3;
	! each line consumes the still unmodified stamp of the next one
	sub	%o1,%o0,%o0
	sub	%o2,%o1,%o1
	sub	%o3,%o2,%o2
	sub	%o4,%o3,%o3

	! find the minimum value: the annulled branch lets its delay
	! slot run only when %o0 is (unsigned) greater than the other
	! operand, which makes each triplet a conditional move
	cmp	%o0,%o1
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o1,%o0
	cmp	%o0,%o2
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o2,%o0
	cmp	%o0,%o3
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o3,%o0

	retl
	nop
.size	_sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument
298
.global	_sparcv9_vis2_probe
.type	_sparcv9_vis2_probe,#function
.align	8
!
! _sparcv9_vis2_probe: execute one VIS2 instruction (bshuffle on %f0)
! in the delay slot of the return.
! NOTE(review): presumably the caller catches the trap raised on CPUs
! without VIS2; that part is not visible in this file.
!
_sparcv9_vis2_probe:
	retl
	.word	0x81b00980	!bshuffle	%f0,%f0,%f0
.size	_sparcv9_vis2_probe,.-_sparcv9_vis2_probe
306
.global	_sparcv9_fmadd_probe
.type	_sparcv9_fmadd_probe,#function
.align	8
!
! _sparcv9_fmadd_probe: zero %f0 and %f2 with fxor, then execute one
! fused multiply-add (fmaddd) on them in the delay slot of the return.
! NOTE(review): presumably the caller catches the trap raised on CPUs
! without the instruction; that part is not visible in this file.
!
_sparcv9_fmadd_probe:
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	retl
	.word	0x81b80440	!fmaddd	%f0,%f0,%f2,%f0
.size	_sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe
316
.global	OPENSSL_cleanse
.type	OPENSSL_cleanse,#function
.align	32
!
! OPENSSL_cleanse: overwrite a buffer with zero bytes.
!
! In:	%o0 = address of the buffer
!	%o1 = number of bytes to clear
!
! Up to 14 bytes are cleared one byte at a time. A longer buffer is
! first advanced byte-wise to an 8-byte (V9) or 4-byte (V8) boundary,
! then cleared in aligned units of that size, and whatever remains
! goes through the byte loop. Both %o0 and %o1 are consumed; %g1 is
! used by the CPU detection in 32-bit builds.
!
OPENSSL_cleanse:
	cmp	%o1,14
	nop
#ifdef ABI64
	bgu	%xcc,.Lot
#else
	bgu	.Lot
#endif
	tst	%o1			! [delay slot] sets flags for the bne below
	bne	.Little
	nop
	retl				! zero length, nothing to do
	nop

	! byte-at-a-time loop, entered with %o1 != 0
.Little:
	stb	%g0,[%o0]
	subcc	%o1,1,%o1
	bnz	.Little
	add	%o0,1,%o0		! [delay slot]
	retl
	nop
.align	32
.Lot:
#ifndef ABI64
	! V8/V9 detection: the word is V9 "rd %ccr,%g1", which V8
	! executes as "rd %asr2,%g1" without trapping. After the subcc
	! only a V9 CPU reports 0x99.
	subcc	%g0,1,%g1
	.word	0x83408000	!rd	%ccr,%g1
	cmp	%g1,0x99
	bne	.v8lot
	nop
#endif

	! V9: single bytes until %o0 is 8-byte aligned
.v9lot:	andcc	%o0,7,%g0
	bz	.v9aligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.v9lot
	add	%o0,1,%o0		! [delay slot]
.align	16,0x01000000
	! V9: 8 bytes per iteration for as long as at least 8 remain.
	! The branch is hand-encoded with a displacement of -3
	! instructions, so nothing may be inserted into this loop.
.v9aligned:
	.word	0xc0720000	!stx	%g0,[%o0]
	sub	%o1,8,%o1
	andcc	%o1,-8,%g0
#ifdef ABI64
	.word	0x126ffffd	!bnz	%xcc,.v9aligned
#else
	.word	0x124ffffd	!bnz	%icc,.v9aligned
#endif
	add	%o0,8,%o0		! [delay slot]

	tst	%o1			! anything left for the byte loop?
	bne	.Little
	nop
	retl
	nop
#ifndef ABI64
	! V8: single bytes until %o0 is 4-byte aligned
.v8lot:	andcc	%o0,3,%g0
	bz	.v8aligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.v8lot
	add	%o0,1,%o0		! [delay slot]
	nop
	! V8: 4 bytes per iteration for as long as at least 4 remain
.v8aligned:
	st	%g0,[%o0]
	sub	%o1,4,%o1
	andcc	%o1,-4,%g0
	bnz	.v8aligned
	add	%o0,4,%o0		! [delay slot]

	tst	%o1			! anything left for the byte loop?
	bne	.Little
	nop
	retl
	nop
#endif
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
399
! Contribution to the .init section: a call to OPENSSL_cpuid_setup
! (defined outside this file) followed by its delay slot.
.section	".init",#alloc,#execinstr
	call	OPENSSL_cpuid_setup
	nop
403