1/* Copyright (c) 2013-2015, Linaro Limited 2 All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 * Redistributions of source code must retain the above copyright 7 notice, this list of conditions and the following disclaimer. 8 * Redistributions in binary form must reproduce the above copyright 9 notice, this list of conditions and the following disclaimer in the 10 documentation and/or other materials provided with the distribution. 11 * Neither the name of the Linaro nor the 12 names of its contributors may be used to endorse or promote products 13 derived from this software without specific prior written permission. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 26 27/* Assumptions: 28 * 29 * ARMv8-a, AArch64, unaligned accesses, min page size 4k. 30 */ 31 32#include <private/bionic_asm.h> 33 34/* To test the page crossing code path more thoroughly, compile with 35 -DTEST_PAGE_CROSS - this will force all calls through the slower 36 entry path. This option is not intended for production use. */ 37 38/* Arguments and results. */ 39#define srcin x0 40#define len x0 41 42/* Locals and temporaries. */ 43#define src x1 44#define data1 x2 45#define data2 x3 46#define has_nul1 x4 47#define has_nul2 x5 48#define tmp1 x4 49#define tmp2 x5 50#define tmp3 x6 51#define tmp4 x7 52#define zeroones x8 53 54#define L(l) .L ## l 55 56 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 57 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 58 can be done in parallel across the entire word. A faster check 59 (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives 60 false hits for characters 129..255. */ 61 62#define REP8_01 0x0101010101010101 63#define REP8_7f 0x7f7f7f7f7f7f7f7f 64#define REP8_80 0x8080808080808080 65 66#ifdef TEST_PAGE_CROSS 67# define MIN_PAGE_SIZE 15 68#else 69# define MIN_PAGE_SIZE 4096 70#endif 71 72 /* Since strings are short on average, we check the first 16 bytes 73 of the string for a NUL character. In order to do an unaligned ldp 74 safely we have to do a page cross check first. If there is a NUL 75 byte we calculate the length from the 2 8-byte words using 76 conditional select to reduce branch mispredictions (it is unlikely 77 strlen will be repeatedly called on strings with the same length). 78 79 If the string is longer than 16 bytes, we align src so don't need 80 further page cross checks, and process 32 bytes per iteration 81 using the fast NUL check. If we encounter non-ASCII characters, 82 fallback to a second loop using the full NUL check. 83 84 If the page cross check fails, we read 16 bytes from an aligned 85 address, remove any characters before the string, and continue 86 in the main loop using aligned loads. Since strings crossing a 87 page in the first 16 bytes are rare (probability of 88 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. 89 90 AArch64 systems have a minimum page size of 4k. We don't bother 91 checking for larger page sizes - the cost of setting up the correct 92 page size is just not worth the extra gain from a small reduction in 93 the cases taking the slow path. Note that we only care about 94 whether the first fetch, which may be misaligned, crosses a page 95 boundary. */ 96 97ENTRY(strlen) 98 and tmp1, srcin, MIN_PAGE_SIZE - 1 99 mov zeroones, REP8_01 100 cmp tmp1, MIN_PAGE_SIZE - 16 101 b.gt L(page_cross) 102 ldp data1, data2, [srcin] 103#ifdef __AARCH64EB__ 104 /* For big-endian, carry propagation (if the final byte in the 105 string is 0x01) means we cannot use has_nul1/2 directly. 106 Since we expect strings to be small and early-exit, 107 byte-swap the data now so has_null1/2 will be correct. */ 108 rev data1, data1 109 rev data2, data2 110#endif 111 sub tmp1, data1, zeroones 112 orr tmp2, data1, REP8_7f 113 sub tmp3, data2, zeroones 114 orr tmp4, data2, REP8_7f 115 bics has_nul1, tmp1, tmp2 116 bic has_nul2, tmp3, tmp4 117 ccmp has_nul2, 0, 0, eq 118 beq L(main_loop_entry) 119 120 /* Enter with C = has_nul1 == 0. */ 121 csel has_nul1, has_nul1, has_nul2, cc 122 mov len, 8 123 rev has_nul1, has_nul1 124 clz tmp1, has_nul1 125 csel len, xzr, len, cc 126 add len, len, tmp1, lsr 3 127 ret 128 129 /* The inner loop processes 32 bytes per iteration and uses the fast 130 NUL check. If we encounter non-ASCII characters, use a second 131 loop with the accurate NUL check. */ 132 .p2align 4 133L(main_loop_entry): 134 bic src, srcin, 15 135 sub src, src, 16 136L(main_loop): 137 ldp data1, data2, [src, 32]! 138.Lpage_cross_entry: 139 sub tmp1, data1, zeroones 140 sub tmp3, data2, zeroones 141 orr tmp2, tmp1, tmp3 142 tst tmp2, zeroones, lsl 7 143 bne 1f 144 ldp data1, data2, [src, 16] 145 sub tmp1, data1, zeroones 146 sub tmp3, data2, zeroones 147 orr tmp2, tmp1, tmp3 148 tst tmp2, zeroones, lsl 7 149 beq L(main_loop) 150 add src, src, 16 1511: 152 /* The fast check failed, so do the slower, accurate NUL check. */ 153 orr tmp2, data1, REP8_7f 154 orr tmp4, data2, REP8_7f 155 bics has_nul1, tmp1, tmp2 156 bic has_nul2, tmp3, tmp4 157 ccmp has_nul2, 0, 0, eq 158 beq L(nonascii_loop) 159 160 /* Enter with C = has_nul1 == 0. */ 161L(tail): 162#ifdef __AARCH64EB__ 163 /* For big-endian, carry propagation (if the final byte in the 164 string is 0x01) means we cannot use has_nul1/2 directly. The 165 easiest way to get the correct byte is to byte-swap the data 166 and calculate the syndrome a second time. */ 167 csel data1, data1, data2, cc 168 rev data1, data1 169 sub tmp1, data1, zeroones 170 orr tmp2, data1, REP8_7f 171 bic has_nul1, tmp1, tmp2 172#else 173 csel has_nul1, has_nul1, has_nul2, cc 174#endif 175 sub len, src, srcin 176 rev has_nul1, has_nul1 177 add tmp2, len, 8 178 clz tmp1, has_nul1 179 csel len, len, tmp2, cc 180 add len, len, tmp1, lsr 3 181 ret 182 183L(nonascii_loop): 184 ldp data1, data2, [src, 16]! 185 sub tmp1, data1, zeroones 186 orr tmp2, data1, REP8_7f 187 sub tmp3, data2, zeroones 188 orr tmp4, data2, REP8_7f 189 bics has_nul1, tmp1, tmp2 190 bic has_nul2, tmp3, tmp4 191 ccmp has_nul2, 0, 0, eq 192 bne L(tail) 193 ldp data1, data2, [src, 16]! 194 sub tmp1, data1, zeroones 195 orr tmp2, data1, REP8_7f 196 sub tmp3, data2, zeroones 197 orr tmp4, data2, REP8_7f 198 bics has_nul1, tmp1, tmp2 199 bic has_nul2, tmp3, tmp4 200 ccmp has_nul2, 0, 0, eq 201 beq L(nonascii_loop) 202 b L(tail) 203 204 /* Load 16 bytes from [srcin & ~15] and force the bytes that precede 205 srcin to 0x7f, so we ignore any NUL bytes before the string. 206 Then continue in the aligned loop. */ 207L(page_cross): 208 bic src, srcin, 15 209 ldp data1, data2, [src] 210 lsl tmp1, srcin, 3 211 mov tmp4, -1 212#ifdef __AARCH64EB__ 213 /* Big-endian. Early bytes are at MSB. */ 214 lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ 215#else 216 /* Little-endian. Early bytes are at LSB. */ 217 lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ 218#endif 219 orr tmp1, tmp1, REP8_80 220 orn data1, data1, tmp1 221 orn tmp2, data2, tmp1 222 tst srcin, 8 223 csel data1, data1, tmp4, eq 224 csel data2, data2, tmp2, eq 225 b L(page_cross_entry) 226 227END(strlen) 228