/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string to get better performance than the original function. The code is
 * simpler and shorter than the original function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 *
 * Scratch registers written here: rcx, rdx, rsi, rdi, r9.
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c:
	movq %rdi,%r9			/* keep dst; it is the return value */
	movq %rdx,%rcx
	andl $7,%edx			/* edx = count % 8 (trailing bytes) */
	shrq $3,%rcx			/* rcx = count / 8 (whole qwords) */
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax			/* rax = fill byte copied into all 8 bytes */
	rep stosq			/* store the whole qwords */
	movl %edx,%ecx
	rep stosb			/* store the 0..7 trailing bytes */
	movq %r9,%rax
	ret
.Lmemset_e:
	.previous

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 *
 * Scratch registers written here: rax, rcx, rdi, r9.
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c_e:
	movq %rdi,%r9			/* keep dst; it is the return value */
	movb %sil,%al			/* stosb stores al, no expansion needed */
	movq %rdx,%rcx			/* rcx = full byte count */
	rep stosb
	movq %r9,%rax
	ret
.Lmemset_e_e:
	.previous

/*
 * Original memset: open-coded store loops, no string instructions.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 *
 * Register roles inside the function:
 *   rax  fill byte replicated into all 8 bytes
 *   rcx  loop counter
 *   rdx  bytes still to be handled by the tail loops
 *   r8   bytes skipped to reach 8-byte alignment (bad-alignment path only)
 *   r9   dst & 7
 *   r10  saved destination, returned in rax
 */
ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq %rdi,%r10

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq  %rcx,%rax

	/* align dst */
	movl  %edi,%r9d
	andl  $7,%r9d			/* r9d = misalignment of dst (0..7) */
	jnz  .Lbad_alignment
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	movq  %rdx,%rcx
	shrq  $6,%rcx			/* rcx = number of 64-byte chunks */
	jz	 .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * decq sets ZF for the jnz at the bottom of the loop; the movq and
	 * leaq instructions in between do not modify the flags.
	 */
	decq  %rcx
	movq  %rax,(%rdi)
	movq  %rax,8(%rdi)
	movq  %rax,16(%rdi)
	movq  %rax,24(%rdi)
	movq  %rax,32(%rdi)
	movq  %rax,40(%rdi)
	movq  %rax,48(%rdi)
	movq  %rax,56(%rdi)
	leaq  64(%rdi),%rdi
	jnz    .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl	%edx,%ecx
	andl    $63&(~7),%ecx		/* bytes left in whole qwords (0..56) */
	jz 		.Lhandle_7
	shrl	$3,%ecx			/* ecx = number of qwords (1..7) */
	.p2align 4
.Lloop_8:
	decl   %ecx			/* flags survive the movq/leaq below */
	movq  %rax,(%rdi)
	leaq  8(%rdi),%rdi
	jnz    .Lloop_8

.Lhandle_7:
	andl	$7,%edx			/* edx = trailing bytes (0..7) */
	jz      .Lende
	.p2align 4
.Lloop_1:
	decl    %edx			/* flags survive the movb/leaq below */
	movb 	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz     .Lloop_1

.Lende:
	movq	%r10,%rax
	ret

	CFI_RESTORE_STATE
.Lbad_alignment:
	cmpq $7,%rdx
	jbe	.Lhandle_7		/* count <= 7: byte loop only */
	/*
	 * One unaligned 8-byte store covers the bytes up to the next 8-byte
	 * boundary. dst then advances by r8 = 8 - (dst & 7) and count drops by
	 * the same amount; bytes of this store beyond the boundary are
	 * written again, with the same value, by the stores that follow.
	 */
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)

	/* Some CPUs support enhanced REP MOVSB/STOSB feature.
	 * It is recommended to use this when possible.
	 *
	 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
	 * instructions.
	 *
	 * Otherwise, use original memset function.
	 *
	 * In .altinstructions section, ERMS feature is placed after REP_GOOD
         * feature to implement the right patch order.
	 */
	.section .altinstructions,"a"
	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
	.previous