/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2014, NVIDIA Corporation. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * denver, ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).  The feature can be disabled by defining DONT_USE_DC.

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call. */
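
/* For reference, DCZID_EL0 encodes the DC ZVA block size in its low four
   bits (log2 of the size in 4-byte words) and advertises that the
   instruction is prohibited when bit 4 (DZP) is set.  A minimal C sketch of
   the decode performed by the assembly below (illustrative only; the helper
   name is made up):

       static inline unsigned long dc_zva_block_size(void) {
           unsigned long dczid;
           __asm__("mrs %0, dczid_el0" : "=r"(dczid));
           if (dczid & (1ul << 4))
               return 0;                  // DZP set: DC ZVA must not be used
           return 4ul << (dczid & 0xf);   // block size in bytes
       }
*/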

#define dstin       x0
#define val         w1
#define count       x2
#define tmp1        x3
#define tmp1w       w3
#define tmp2        x4
#define tmp2w       w4
#define zva_len_x   x5
#define zva_len     w5
#define zva_bits_x  x6

#define A_l         x7
#define A_lw        w7
#define dst         x8
#define tmp3w       w9

#define QA_l        q0

ENTRY(memset)

        mov     dst, dstin              /* Preserve return value.  */
        ands    A_lw, val, #255
#ifndef DONT_USE_DC
        b.eq    .Lzero_mem
#endif
        orr     A_lw, A_lw, A_lw, lsl #8
        orr     A_lw, A_lw, A_lw, lsl #16
        orr     A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
        cmp     count, #256
        b.ge    .Lnot_short
.Ltail_maybe_tiny:
        cmp     count, #15
        b.le    .Ltail15tiny
.Ltail255:
        ands    tmp1, count, #0xC0
        b.eq    .Ltail63
        dup     v0.4s, A_lw
        cmp     tmp1w, #0x80
        b.eq    1f
        b.lt    2f
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
1:
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
2:
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
.Ltail63:
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        add     dst, dst, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        stp     A_l, A_l, [dst, #-48]
1:
        stp     A_l, A_l, [dst, #-32]
2:
        stp     A_l, A_l, [dst, #-16]

.Ltail15:
        and     count, count, #15
        add     dst, dst, count
        stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store.  */
        ret

.Ltail15tiny:
        /* Set up to 15 bytes.  Does not assume earlier memory
           being set.  */
        tbz     count, #3, 1f
        str     A_l, [dst], #8
1:
        tbz     count, #2, 1f
        str     A_lw, [dst], #4
1:
        tbz     count, #1, 1f
        strh    A_lw, [dst], #2
1:
        tbz     count, #0, 1f
        strb    A_lw, [dst]
1:
        ret

        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line, this ensures the entire loop is in one line.  */
        .p2align 6
.Lnot_short:
        dup     v0.4s, A_lw
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    2f
        /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
         * more than that to set, so we simply store 16 bytes and advance by
         * the amount required to reach alignment.  */
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        /* There may be less than 256 bytes to go now.  */
        cmp     count, #255
        b.le    .Ltail255
2:
        cmp     count, #2097152
        b.gt    3f
1:
        sub     count, count, #256
2:
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        subs    count, count, #256
        b.ge    2b
        tst     count, #0xff
        b.ne    .Ltail255
        ret
3:
        sub     count, count, #64
4:
        subs    count, count, #64
        stnp    QA_l, QA_l, [dst]
        stnp    QA_l, QA_l, [dst, #32]
        add     dst, dst, #64
        b.ge    4b
        tst     count, #0x3f
        b.ne    .Ltail63
        ret

#ifndef DONT_USE_DC
        /* For zeroing memory, check to see if we can use the ZVA feature to
         * zero entire 'cache' lines.  */
.Lzero_mem:
        mov     A_l, #0
        cmp     count, #63
        b.le    .Ltail_maybe_tiny
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    1f
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        cmp     count, #63
        b.le    .Ltail63
1:
        /* For zeroing small amounts of memory, it's not worth setting up
         * the line-clear code.  */
        cmp     count, #128
        b.lt    .Lnot_short
#ifdef MAYBE_VIRT
        /* For efficiency when virtualized, we cache the ZVA capability.  */
        adrp    tmp2, .Lcache_clear
        ldr     zva_len, [tmp2, #:lo12:.Lcache_clear]
        tbnz    zva_len, #31, .Lnot_short
        cbnz    zva_len, .Lzero_by_line
        mrs     tmp1, dczid_el0
        tbz     tmp1, #4, 1f
        /* ZVA not available.  Remember this for next time.  */
        mov     zva_len, #~0
        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
        b       .Lnot_short
1:
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
        mrs     tmp1, dczid_el0
        tbnz    tmp1, #4, .Lnot_short
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
#endif
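
        /* At this point zva_len holds the DC ZVA block size in bytes
           (4 << DCZID_EL0.BS).  The block size is always a power of two, so
           (zva_len - 1) below serves as the alignment mask; DC ZVA requires
           dst to be aligned to the block size before it can be used.  */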

.Lzero_by_line:
        /* Compute how far we need to go to become suitably aligned.  We're
         * already at quad-word alignment.  */
        cmp     count, zva_len_x
        b.lt    .Lnot_short             /* Not enough to reach alignment.  */
        sub     zva_bits_x, zva_len_x, #1
        neg     tmp2, dst
        ands    tmp2, tmp2, zva_bits_x
        b.eq    1f                      /* Already aligned.  */
        /* Not aligned, check that there's enough to zero after alignment.  */
        sub     tmp1, count, tmp2
        cmp     tmp1, #64
        ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
        b.lt    .Lnot_short
        /* We know that there's at least 64 bytes to zero and that it's safe
         * to overrun by 64 bytes.  */
        mov     count, tmp1
2:
        stp     A_l, A_l, [dst]
        stp     A_l, A_l, [dst, #16]
        stp     A_l, A_l, [dst, #32]
        subs    tmp2, tmp2, #64
        stp     A_l, A_l, [dst, #48]
        add     dst, dst, #64
        b.ge    2b
        /* We've overrun a bit, so adjust dst downwards.  */
        add     dst, dst, tmp2
1:
        sub     count, count, zva_len_x
3:
        dc      zva, dst
        add     dst, dst, zva_len_x
        subs    count, count, zva_len_x
        b.ge    3b
        ands    count, count, zva_bits_x
        b.ne    .Ltail_maybe_long
        ret
END(memset)

#ifdef MAYBE_VIRT
        .bss
        .p2align 2
.Lcache_clear:
        .space 4
#endif
#endif /* DONT_USE_DC */