1dbebecc2bf00530ce09b3658641d1514d807aeeenjn
2dbebecc2bf00530ce09b3658641d1514d807aeeenjn.globl _start			# export the program entry symbol
3dbebecc2bf00530ce09b3658641d1514d807aeeenjn
4dbebecc2bf00530ce09b3658641d1514d807aeeenjn_start:
5dbebecc2bf00530ce09b3658641d1514d807aeeenjn        # This code tests for the fldcw "load floating point control word"
6dbebecc2bf00530ce09b3658641d1514d807aeeenjn	#   instruction.  On most x86 processors the retired_instruction
7dbebecc2bf00530ce09b3658641d1514d807aeeenjn	#   performance counter counts this as one instruction.  However,
8dbebecc2bf00530ce09b3658641d1514d807aeeenjn	#   on Pentium 4 systems it counts as two.  Therefore this can
9dbebecc2bf00530ce09b3658641d1514d807aeeenjn	#   affect BBV results on such a system.
10dbebecc2bf00530ce09b3658641d1514d807aeeenjn	# fldcw is most often used to set the rounding mode when doing
11dbebecc2bf00530ce09b3658641d1514d807aeeenjn	#   floating point to integer conversions.
12dbebecc2bf00530ce09b3658641d1514d807aeeenjn
13dbebecc2bf00530ce09b3658641d1514d807aeeenjn	# It is encoded as "d9 /5": opcode byte 0xd9, then a ModRM byte whose reg field is 101
14dbebecc2bf00530ce09b3658641d1514d807aeeenjn	#   1101 1001 xx10 1yyy
15dbebecc2bf00530ce09b3658641d1514d807aeeenjn	# Where xx is the "mod" field: 00 = no displacement, 01 = 8-bit displacement, 10 = 32-bit displacement
16dbebecc2bf00530ce09b3658641d1514d807aeeenjn	#   and yyy is the r/m field that selects the base register
17dbebecc2bf00530ce09b3658641d1514d807aeeenjn
18dbebecc2bf00530ce09b3658641d1514d807aeeenjn        # These instructions have encodings similar to fldcw: the same 0xd9 opcode and a
19dbebecc2bf00530ce09b3658641d1514d807aeeenjn	# second byte with reg = 101 but mod = 11, so a test that isn't explicit enough gives false positives.
20dbebecc2bf00530ce09b3658641d1514d807aeeenjnsimilar:				# each load-constant below pushes one value on the x87 stack (7 total, not popped here)
21dbebecc2bf00530ce09b3658641d1514d807aeeenjn        fld1   	   	       		# d9 e8
22dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldl2t				# d9 e9
23dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldl2e				# d9 ea
24dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldpi				# d9 eb
25dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldlg2				# d9 ec
26dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldln2				# d9 ed
27dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldz				# d9 ee
28dbebecc2bf00530ce09b3658641d1514d807aeeenjn
29dbebecc2bf00530ce09b3658641d1514d807aeeenjn	# Check some varied ways of addressing the fldcw memory operand.
30dbebecc2bf00530ce09b3658641d1514d807aeeenjn
31dbebecc2bf00530ce09b3658641d1514d807aeeenjn	# Operand at a small offset from the stack pointer
32dbebecc2bf00530ce09b3658641d1514d807aeeenjnstack:
33dbebecc2bf00530ce09b3658641d1514d807aeeenjn	sub	$8,%rsp			# allocate 8 bytes of scratch space on the stack
34dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fnstcw	2(%rsp)			# store the current control word into the scratch space
35dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldcw	2(%rsp)			# reload the value just stored, so the FPU state is unchanged
36dbebecc2bf00530ce09b3658641d1514d807aeeenjn	add	$8,%rsp			# release the scratch space
37dbebecc2bf00530ce09b3658641d1514d807aeeenjn
38dbebecc2bf00530ce09b3658641d1514d807aeeenjn	# fldcw through a 64-bit base register, once each for rax, rbx, rcx and rdx
39dbebecc2bf00530ce09b3658641d1514d807aeeenjnsixtyfour_reg:
40dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fnstcw	cw			# snapshot the current control word so every reload below is a no-op
41dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$cw,%rax		# rax = address of cw
42dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldcw	0(%rax)			# base = rax (NOTE(review): the explicit 0 is presumably assembled as the mod=00 form - confirm with objdump)
43dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$cw,%rbx		# rbx = address of cw
44dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldcw	0(%rbx)			# base = rbx
45dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$cw,%rcx		# rcx = address of cw
46dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldcw	0(%rcx)			# base = rcx
47dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$cw,%rdx		# rdx = address of cw
48dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldcw	0(%rdx)			# base = rdx
49dbebecc2bf00530ce09b3658641d1514d807aeeenjn
50dbebecc2bf00530ce09b3658641d1514d807aeeenjn	# fldcw through a 32-bit base register (address-size prefix 0x67), no displacement
51c98d2af90d010d78afc547141a61ca6029240cc6vince
52c98d2af90d010d78afc547141a61ca6029240cc6vince	# Note!  The assembler that comes with SuSE 9.1
53c98d2af90d010d78afc547141a61ca6029240cc6vince	#        cannot assemble fldcw with a 32-bit base register on 64-bit systems.
54c98d2af90d010d78afc547141a61ca6029240cc6vince	#        Hence each such fldcw is hand-coded with .byte; the intended
55c98d2af90d010d78afc547141a61ca6029240cc6vince	#        instruction is kept as a comment directly above its bytes.
56c98d2af90d010d78afc547141a61ca6029240cc6vince
57dbebecc2bf00530ce09b3658641d1514d807aeeenjnthirtytwo_reg:
58dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fnstcw	cw			# snapshot the current control word so every reload below is a no-op
59dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$cw,%eax		# eax = address of cw (assumes cw is linked below 4 GiB - TODO confirm)
60c98d2af90d010d78afc547141a61ca6029240cc6vince
61c98d2af90d010d78afc547141a61ca6029240cc6vince#	fldcw	0(%eax)			# eax
62c98d2af90d010d78afc547141a61ca6029240cc6vince	.byte	0x67,0xd9,0x28		# 67 = address-size prefix, d9 = opcode, 28 = mod 00, reg 101, r/m 000 (eax)
63c98d2af90d010d78afc547141a61ca6029240cc6vince
64dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$cw,%ebx		# ebx = address of cw
65c98d2af90d010d78afc547141a61ca6029240cc6vince
66c98d2af90d010d78afc547141a61ca6029240cc6vince#	fldcw	0(%ebx)			# ebx
67c98d2af90d010d78afc547141a61ca6029240cc6vince	.byte	0x67,0xd9,0x2b		# ModRM 2b = mod 00, reg 101, r/m 011 (ebx)
68c98d2af90d010d78afc547141a61ca6029240cc6vince
69dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$cw,%ecx		# ecx = address of cw
70dbebecc2bf00530ce09b3658641d1514d807aeeenjn
71c98d2af90d010d78afc547141a61ca6029240cc6vince#	fldcw	0(%ecx)			# ecx
72c98d2af90d010d78afc547141a61ca6029240cc6vince	.byte	0x67,0xd9,0x29		# ModRM 29 = mod 00, reg 101, r/m 001 (ecx)
73c98d2af90d010d78afc547141a61ca6029240cc6vince
74c98d2af90d010d78afc547141a61ca6029240cc6vince	mov	$cw,%edx		# edx = address of cw
75c98d2af90d010d78afc547141a61ca6029240cc6vince
76c98d2af90d010d78afc547141a61ca6029240cc6vince#	fldcw	0(%edx)			# edx
77c98d2af90d010d78afc547141a61ca6029240cc6vince	.byte	0x67,0xd9,0x2a		# ModRM 2a = mod 00, reg 101, r/m 010 (edx)
78c98d2af90d010d78afc547141a61ca6029240cc6vince
79dbebecc2bf00530ce09b3658641d1514d807aeeenjn	# 32-bit base register + 8-bit displacement (ModRM mod = 01), hand-coded with .byte
80dbebecc2bf00530ce09b3658641d1514d807aeeenjneight_bit:
81dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$cw,%eax		# eax = address of cw
82dbebecc2bf00530ce09b3658641d1514d807aeeenjn	sub	$32,%eax		# bias the base down by 32 so that base + 32 is cw again
83dbebecc2bf00530ce09b3658641d1514d807aeeenjn
84c98d2af90d010d78afc547141a61ca6029240cc6vince#	fldcw	32(%eax)		# eax + 8 bit offset
85c98d2af90d010d78afc547141a61ca6029240cc6vince	.byte 0x67,0xd9,0x68,0x20	# ModRM 68 = mod 01, reg 101, r/m 000 (eax); disp8 = 0x20 = 32
86c98d2af90d010d78afc547141a61ca6029240cc6vince
87dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	%eax,%ebx		# ebx = same biased base
88c98d2af90d010d78afc547141a61ca6029240cc6vince#	fldcw	32(%ebx)		# ebx + 8 bit offset
89c98d2af90d010d78afc547141a61ca6029240cc6vince	.byte	0x67,0xd9,0x6b,0x20	# ModRM 6b = mod 01, reg 101, r/m 011 (ebx); disp8 = 32
90c98d2af90d010d78afc547141a61ca6029240cc6vince
91dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	%eax,%ecx		# ecx = same biased base
92c98d2af90d010d78afc547141a61ca6029240cc6vince
93c98d2af90d010d78afc547141a61ca6029240cc6vince#	fldcw	32(%ecx)		# ecx + 8 bit offset
94c98d2af90d010d78afc547141a61ca6029240cc6vince	.byte	0x67,0xd9,0x69,0x20	# ModRM 69 = mod 01, reg 101, r/m 001 (ecx); disp8 = 32
95c98d2af90d010d78afc547141a61ca6029240cc6vince
96dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	%eax,%edx		# edx = same biased base
97c98d2af90d010d78afc547141a61ca6029240cc6vince
98c98d2af90d010d78afc547141a61ca6029240cc6vince#	fldcw	32(%edx)		# edx + 8 bit offset
99c98d2af90d010d78afc547141a61ca6029240cc6vince	.byte	0x67,0xd9,0x6a,0x20	# ModRM 6a = mod 01, reg 101, r/m 010 (edx); disp8 = 32
100c98d2af90d010d78afc547141a61ca6029240cc6vince
101dbebecc2bf00530ce09b3658641d1514d807aeeenjn
102dbebecc2bf00530ce09b3658641d1514d807aeeenjn	# 32-bit base register + 32-bit displacement (ModRM mod = 10), hand-coded with .byte
103dbebecc2bf00530ce09b3658641d1514d807aeeenjnthirtytwo_bit:
104dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$cw,%eax		# eax = address of cw
105dbebecc2bf00530ce09b3658641d1514d807aeeenjn	sub	$30000,%eax		# bias the base down by 30000 so that base + 30000 is cw again
106dbebecc2bf00530ce09b3658641d1514d807aeeenjn
107c98d2af90d010d78afc547141a61ca6029240cc6vince#	fldcw	30000(%eax)		# eax + 32 bit offset
108c98d2af90d010d78afc547141a61ca6029240cc6vince	.byte	0x67,0xd9,0xa8,0x30,0x75,0x00,0x00	# ModRM a8 = mod 10, reg 101, r/m 000 (eax); disp32 = 0x00007530 = 30000, little-endian
109c98d2af90d010d78afc547141a61ca6029240cc6vince
110dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	%eax,%ebx		# ebx = same biased base
111c98d2af90d010d78afc547141a61ca6029240cc6vince
112c98d2af90d010d78afc547141a61ca6029240cc6vince#	fldcw	30000(%ebx)		# ebx + 32 bit offset
113c98d2af90d010d78afc547141a61ca6029240cc6vince	.byte	0x67,0xd9,0xab,0x30,0x75,0x00,0x00	# ModRM ab = mod 10, reg 101, r/m 011 (ebx); disp32 = 30000
114c98d2af90d010d78afc547141a61ca6029240cc6vince
115dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	%eax,%ecx		# ecx = same biased base
116c98d2af90d010d78afc547141a61ca6029240cc6vince
117c98d2af90d010d78afc547141a61ca6029240cc6vince#	fldcw	30000(%ecx)		# ecx + 32 bit offset
118c98d2af90d010d78afc547141a61ca6029240cc6vince	.byte	0x67,0xd9,0xa9,0x30,0x75,0x00,0x00	# ModRM a9 = mod 10, reg 101, r/m 001 (ecx); disp32 = 30000
119c98d2af90d010d78afc547141a61ca6029240cc6vince
120dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	%eax,%edx		# edx = same biased base
121c98d2af90d010d78afc547141a61ca6029240cc6vince
122c98d2af90d010d78afc547141a61ca6029240cc6vince#	fldcw	30000(%edx)		# edx + 32 bit offset
123c98d2af90d010d78afc547141a61ca6029240cc6vince	.byte	0x67,0xd9,0xaa,0x30,0x75,0x00,0x00	# ModRM aa = mod 10, reg 101, r/m 010 (edx); disp32 = 30000
124c98d2af90d010d78afc547141a61ca6029240cc6vince
125dbebecc2bf00530ce09b3658641d1514d807aeeenjn	# Do an fp-to-integer conversion (the typical use of fldcw)
126dbebecc2bf00530ce09b3658641d1514d807aeeenjn	# in a loop to give a bigger count: two fldcw per iteration.
127dbebecc2bf00530ce09b3658641d1514d807aeeenjn
128dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$1024,%rcx		# rcx = iteration count, decremented by the loop instruction below
129dbebecc2bf00530ce09b3658641d1514d807aeeenjnbig_loop:
130dbebecc2bf00530ce09b3658641d1514d807aeeenjn
131dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldl	three			# push the 64-bit double at three (3.0) onto the fp stack
132dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fnstcw	saved_cw		# store the current control word to mem
133dbebecc2bf00530ce09b3658641d1514d807aeeenjn	movzwl	saved_cw, %eax		# load the 16-bit cw from mem, zero extending
134dbebecc2bf00530ce09b3658641d1514d807aeeenjn	movb	$12, %ah		# overwrite cw bits 8-15 with 0x0c: rounding control = 11 ("round to zero"); precision-control bits 8-9 become 00
135b1f90e040439f3eec6ae82d71dc0aabb725c6b30florian	movw	%ax, cw			# store the modified cw back to memory
136dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldcw	cw   			# load the new rounding mode
137dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fistpl	result			# convert the stack top to a 32-bit integer in mem and pop it
138dbebecc2bf00530ce09b3658641d1514d807aeeenjn	fldcw	saved_cw		# restore the old cw
139dbebecc2bf00530ce09b3658641d1514d807aeeenjn
140dbebecc2bf00530ce09b3658641d1514d807aeeenjn	loop	big_loop		# decrement rcx and repeat until it reaches zero; makes the count more obvious
141dbebecc2bf00530ce09b3658641d1514d807aeeenjn
142dbebecc2bf00530ce09b3658641d1514d807aeeenjn	movl	result, %ebx		# sanity check: the 32-bit load zero-extends into rbx
143dbebecc2bf00530ce09b3658641d1514d807aeeenjn	cmp	$3,%rbx			# 3.0 converted to integer must be 3
144dbebecc2bf00530ce09b3658641d1514d807aeeenjn	je	exit			# expected result: skip the error message
145dbebecc2bf00530ce09b3658641d1514d807aeeenjn
146dbebecc2bf00530ce09b3658641d1514d807aeeenjnprint_error:				# reached only when result != 3; NOTE(review): falls through to exit, so the exit status is still 0 - confirm this is intended
147dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov 	$1,%rax			# write syscall
148dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$1,%rdi			# fd 1 = stdout
149dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$error,%rsi		# address of the error string
150dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov 	$22,%rdx		# length of the string in bytes, not counting the trailing NUL
151dbebecc2bf00530ce09b3658641d1514d807aeeenjn	syscall				# write the message; the return value is not checked
152dbebecc2bf00530ce09b3658641d1514d807aeeenjn
153dbebecc2bf00530ce09b3658641d1514d807aeeenjnexit:					# terminate the process; reached from both the success and the error path
154dbebecc2bf00530ce09b3658641d1514d807aeeenjn	xor	%rdi, %rdi		# exit status 0
155dbebecc2bf00530ce09b3658641d1514d807aeeenjn	mov	$60, %rax		# SYSCALL_EXIT
156dbebecc2bf00530ce09b3658641d1514d807aeeenjn	syscall				# invoke exit
157dbebecc2bf00530ce09b3658641d1514d807aeeenjn
158dbebecc2bf00530ce09b3658641d1514d807aeeenjn
159dbebecc2bf00530ce09b3658641d1514d807aeeenjn
160dbebecc2bf00530ce09b3658641d1514d807aeeenjn.data					# writable storage used by the tests above
161dbebecc2bf00530ce09b3658641d1514d807aeeenjnsaved_cw:	.long 0		# original control word saved in big_loop (fnstcw writes only the low 16 bits)
162dbebecc2bf00530ce09b3658641d1514d807aeeenjncw:  	.long	0		# control word slot stored to and reloaded from by the fldcw tests
163dbebecc2bf00530ce09b3658641d1514d807aeeenjnresult: .long	0		# 32-bit integer written by fistpl in big_loop
164dbebecc2bf00530ce09b3658641d1514d807aeeenjnthree:	.long	0			# a floating point 3.0 as a 64-bit double: low 32 bits (this 8-byte value sits 12 bytes into .data, so it is not 8-byte aligned)
165dbebecc2bf00530ce09b3658641d1514d807aeeenjn	.long	1074266112		# high 32 bits of 3.0 = 0x40080000
166dbebecc2bf00530ce09b3658641d1514d807aeeenjnerror:	.asciz  "Error!  Wrong result!\n"	# 22 bytes plus the NUL terminator; the length is hard-coded in print_error
167