memset.S revision 784609317d49e854813f1797d7a53cf7d4379643
/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */

#define dstin       x0
#define val         w1
#define count       x2
#define dst_count   x3  /* for __memset_chk */
#define tmp1        x3
#define tmp1w       w3
#define tmp2        x4
#define tmp2w       w4
#define zva_len_x   x5
#define zva_len     w5
#define zva_bits_x  x6

#define A_l         x7
#define A_lw        w7
#define dst         x8
#define tmp3w       w9

ENTRY(__memset_chk)
    cmp     count, dst_count
    bls     memset

    // Preserve for accurate backtrace.
    stp     x29, x30, [sp, -16]!
    .cfi_def_cfa_offset 16
    .cfi_rel_offset x29, 0
    .cfi_rel_offset x30, 8

    bl      __memset_chk_fail
END(__memset_chk)

ENTRY(memset)

    mov     dst, dstin          /* Preserve return value. */
    ands    A_lw, val, #255
    b.eq    .Lzero_mem
    orr     A_lw, A_lw, A_lw, lsl #8
    orr     A_lw, A_lw, A_lw, lsl #16
    orr     A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
    cmp     count, #64
    b.ge    .Lnot_short
.Ltail_maybe_tiny:
    cmp     count, #15
    b.le    .Ltail15tiny
.Ltail63:
    ands    tmp1, count, #0x30
    b.eq    .Ltail15
    add     dst, dst, tmp1
    cmp     tmp1w, #0x20
    b.eq    1f
    b.lt    2f
    stp     A_l, A_l, [dst, #-48]
1:
    stp     A_l, A_l, [dst, #-32]
2:
    stp     A_l, A_l, [dst, #-16]

.Ltail15:
    and     count, count, #15
    add     dst, dst, count
    stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store. */
    ret

.Ltail15tiny:
    /* Set up to 15 bytes.  Does not assume earlier memory
       being set.  */
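    /* Each tbz below tests one bit of count (8, 4, 2, 1) and, when that
       bit is set, stores that many bytes, so any residual length in
       [0, 15] is handled without a loop. */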
    tbz     count, #3, 1f
    str     A_l, [dst], #8
1:
    tbz     count, #2, 1f
    str     A_lw, [dst], #4
1:
    tbz     count, #1, 1f
    strh    A_lw, [dst], #2
1:
    tbz     count, #0, 1f
    strb    A_lw, [dst]
1:
    ret

    /* Critical loop.  Start at a new cache line boundary.  Assuming
     * 64 bytes per line, this ensures the entire loop is in one line. */
    .p2align 6
.Lnot_short:
    neg     tmp2, dst
    ands    tmp2, tmp2, #15
    b.eq    2f
    /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
     * more than that to set, so we simply store 16 bytes and advance by
     * the amount required to reach alignment. */
    sub     count, count, tmp2
    stp     A_l, A_l, [dst]
    add     dst, dst, tmp2
    /* There may be less than 63 bytes to go now. */
    cmp     count, #63
    b.le    .Ltail63
2:
    sub     dst, dst, #16       /* Pre-bias. */
    sub     count, count, #64
1:
    stp     A_l, A_l, [dst, #16]
    stp     A_l, A_l, [dst, #32]
    stp     A_l, A_l, [dst, #48]
    stp     A_l, A_l, [dst, #64]!
    subs    count, count, #64
    b.ge    1b
    tst     count, #0x3f
    add     dst, dst, #16
    b.ne    .Ltail63
    ret

    /* For zeroing memory, check to see if we can use the ZVA feature to
     * zero entire 'cache' lines. */
.Lzero_mem:
    mov     A_l, #0
    cmp     count, #63
    b.le    .Ltail_maybe_tiny
    neg     tmp2, dst
    ands    tmp2, tmp2, #15
    b.eq    1f
    sub     count, count, tmp2
    stp     A_l, A_l, [dst]
    add     dst, dst, tmp2
    cmp     count, #63
    b.le    .Ltail63
1:
    /* For zeroing small amounts of memory, it's not worth setting up
     * the line-clear code. */
    cmp     count, #128
    b.lt    .Lnot_short
#ifdef MAYBE_VIRT
    /* For efficiency when virtualized, we cache the ZVA capability. */
    adrp    tmp2, .Lcache_clear
    ldr     zva_len, [tmp2, #:lo12:.Lcache_clear]
    tbnz    zva_len, #31, .Lnot_short
    cbnz    zva_len, .Lzero_by_line
    mrs     tmp1, dczid_el0
    tbz     tmp1, #4, 1f
    /* ZVA not available.  Remember this for next time. */
    mov     zva_len, #~0
    str     zva_len, [tmp2, #:lo12:.Lcache_clear]
    b       .Lnot_short
1:
    mov     tmp3w, #4
    and     zva_len, tmp1w, #15     /* Safety: other bits reserved. */
    lsl     zva_len, tmp3w, zva_len /* Block size in bytes = 4 << DCZID_EL0.BS. */
    str     zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
    mrs     tmp1, dczid_el0
    tbnz    tmp1, #4, .Lnot_short
    mov     tmp3w, #4
    and     zva_len, tmp1w, #15     /* Safety: other bits reserved. */
    lsl     zva_len, tmp3w, zva_len /* Block size in bytes = 4 << DCZID_EL0.BS. */
#endif

.Lzero_by_line:
    /* Compute how far we need to go to become suitably aligned.  We're
     * already at quad-word alignment. */
    cmp     count, zva_len_x
    b.lt    .Lnot_short             /* Not enough to reach alignment. */
    sub     zva_bits_x, zva_len_x, #1
    neg     tmp2, dst
    ands    tmp2, tmp2, zva_bits_x
    b.eq    1f                      /* Already aligned. */
    /* Not aligned, check that there's enough to set after alignment. */
    sub     tmp1, count, tmp2
    cmp     tmp1, #64
    ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
    b.lt    .Lnot_short
    /* We know that there are at least 64 bytes to zero and that it's safe
     * to overrun by 64 bytes. */
    mov     count, tmp1
2:
    stp     A_l, A_l, [dst]
    stp     A_l, A_l, [dst, #16]
    stp     A_l, A_l, [dst, #32]
    subs    tmp2, tmp2, #64
    stp     A_l, A_l, [dst, #48]
    add     dst, dst, #64
    b.ge    2b
    /* We've overrun a bit, so adjust dst downwards. */
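    /* tmp2 is now negative: the loop above overshot the ZVA boundary by
       -tmp2 bytes, so this add moves dst back onto the boundary. */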
    add     dst, dst, tmp2
1:
    sub     count, count, zva_len_x
3:
    dc      zva, dst
    add     dst, dst, zva_len_x
    subs    count, count, zva_len_x
    b.ge    3b
    ands    count, count, zva_bits_x
    b.ne    .Ltail_maybe_long
    ret
END(memset)

#ifdef MAYBE_VIRT
    .bss
    .p2align 2
.Lcache_clear:
    .space 4
#endif
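/* Note: .Lcache_clear lives in zero-initialized .bss, so it has three
 * states:
 *   0         - DCZID_EL0 has not been read yet;
 *   ~0        - DC ZVA is prohibited (DZP was set), so never use it;
 *   otherwise - the cached ZVA block length in bytes. */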