1e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng/* Copyright 2002 Andi Kleen, SuSE Labs */
2e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng
3e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng#include <linux/linkage.h>
4e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng#include <asm/dwarf2.h>
5e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng#include <asm/cpufeature.h>
6e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng#include <asm/alternative-asm.h>
7e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng
/*
 * ISO C memset - fill a memory block with a byte value.
 *
 * Fast-string variant: the bulk of the buffer is written eight bytes at a
 * time with "rep stosq" and the remaining 0..7 bytes with "rep stosb".
 * It is simpler and shorter than the original function it replaces.
 *
 * In:
 *   rdi   destination
 *   rsi   value (char, only the low byte is used)
 *   rdx   count (bytes)
 *
 * Out:
 *   rax   original destination
 *
 * Clobbers: rcx, rdx, rsi, rdi, r9, flags
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c:
	movq	%rdi, %r9		/* keep dst for the return value */
	movzbl	%sil, %esi		/* rsi = fill byte, zero-extended */
	movq	%rdx, %rcx
	shrq	$3, %rcx		/* rcx = number of whole qwords */
	andl	$7, %edx		/* rdx = leftover bytes (0..7) */
	/* replicate the fill byte into all eight bytes of rax */
	movabs	$0x0101010101010101, %rax
	imulq	%rsi, %rax
	rep stosq			/* qword part */
	movl	%edx, %ecx
	rep stosb			/* byte tail */
	movq	%r9, %rax
	ret
.Lmemset_e:
	.previous
36e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng
/*
 * ISO C memset - fill a memory block with a byte value.
 *
 * Enhanced "rep stosb" variant, which overrides the fast-string variant.
 * The whole buffer is written by a single "rep stosb", so no byte
 * replication and no separate tail handling are needed; the code is
 * simpler and shorter than the fast-string function as well.
 *
 * In:
 *   rdi   destination
 *   rsi   value (char, only the low byte is used)
 *   rdx   count (bytes)
 *
 * Out:
 *   rax   original destination
 *
 * Clobbers: rcx, rdi, r9
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c_e:
	movq	%rdx, %rcx		/* rcx = byte count for rep */
	movb	%sil, %al		/* al = fill byte */
	movq	%rdi, %r9		/* keep dst for the return value */
	rep stosb
	movq	%r9, %rax
	ret
.Lmemset_e_e:
	.previous
58e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng
/*
 * memset / __memset - original (open-coded) implementation.
 *
 * In:
 *   rdi   destination
 *   rsi   value (char, only the low byte is used)
 *   rdx   count (bytes)
 *
 * Out:
 *   rax   original destination
 *
 * Register use inside the function:
 *   rax   fill byte replicated into all eight bytes
 *   rcx   loop counters
 *   r8    bytes needed to reach the next 8-byte boundary
 *   r9    destination & 7
 *   r10   saved destination
 */
ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC

	/* replicate the fill byte into every byte of rax */
	movzbl	%sil, %ecx
	movabs	$0x0101010101010101, %rax
	imulq	%rcx, %rax

	movq	%rdi, %r10		/* remember dst for the return value */

	/* is dst 8-byte aligned? */
	movl	%edi, %r9d
	andl	$7, %r9d
	jnz	.Lunaligned_dst
	CFI_REMEMBER_STATE
.Ldst_aligned:

	movq	%rdx, %rcx
	shrq	$6, %rcx		/* rcx = number of 64-byte chunks */
	jz	.Ltail_qwords

	.p2align 4
.Lfill_64:
	decq	%rcx			/* flags survive the mov/lea below */
	movq	%rax, (%rdi)
	movq	%rax, 8(%rdi)
	movq	%rax, 16(%rdi)
	movq	%rax, 24(%rdi)
	movq	%rax, 32(%rdi)
	movq	%rax, 40(%rdi)
	movq	%rax, 48(%rdi)
	movq	%rax, 56(%rdi)
	leaq	64(%rdi), %rdi
	jnz	.Lfill_64

	/*
	 * The tail is handled with loops; they should be faster than
	 * hard to predict jump tables.
	 */
	.p2align 4
.Ltail_qwords:
	movl	%edx, %ecx
	andl	$56, %ecx		/* 63 & ~7: qword part of count % 64 */
	jz	.Ltail_bytes
	shrl	$3, %ecx		/* ecx = number of qwords left */
	.p2align 4
.Lfill_8:
	decl	%ecx
	movq	%rax, (%rdi)
	leaq	8(%rdi), %rdi
	jnz	.Lfill_8

.Ltail_bytes:
	andl	$7, %edx		/* edx = trailing bytes (0..7) */
	jz	.Ldone
	.p2align 4
.Lfill_1:
	decl	%edx
	movb	%al, (%rdi)
	leaq	1(%rdi), %rdi
	jnz	.Lfill_1

.Ldone:
	movq	%r10, %rax
	ret

	CFI_RESTORE_STATE
.Lunaligned_dst:
	cmpq	$7, %rdx
	jbe	.Ltail_bytes		/* fewer than 8 bytes: byte loop only */
	movq	$8, %r8
	subq	%r9, %r8		/* r8 = bytes up to the next boundary */
	movq	%rax, (%rdi)		/* one unaligned store covers the head */
	addq	%r8, %rdi
	subq	%r8, %rdx
	jmp	.Ldst_aligned
.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)
137e6e8a0bd7cffcc9ae2e0e75546fb12a19213d4aeBen Cheng
/*
 * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature, and it
 * is recommended to use it when possible.
 *
 * If ERMS is not available but REP_GOOD is, the fast string
 * instructions are used instead.
 *
 * Otherwise the original memset function is used.
 *
 * In the .altinstructions section the ERMS entry is placed after the
 * REP_GOOD entry to implement the right patch order.
 */
	.section .altinstructions, "a"
	altinstruction_entry memset, .Lmemset_c, X86_FEATURE_REP_GOOD, .Lfinal-memset, .Lmemset_e-.Lmemset_c
	altinstruction_entry memset, .Lmemset_c_e, X86_FEATURE_ERMS, .Lfinal-memset, .Lmemset_e_e-.Lmemset_c_e
	.previous
155