/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#include <private/bionic_asm.h>

/* strnlen
 *
 * In:   x0 (srcin) = address of the first byte of the string
 *       x1 (limit) = maximum number of bytes to examine
 * Out:  x0 (len)   = MIN (offset of the first NUL byte, limit);
 *                    limit itself when no NUL is seen within limit bytes
 * Uses: x2-x14 as scratch, and the condition flags.
 *
 * Memory is read as 16-byte pairs starting at srcin rounded down to a
 * 16-byte boundary, so bytes in front of srcin and bytes following the
 * terminator (inside the same 16-byte chunk) may be loaded.  Bytes in
 * front of srcin are forced to 0xff before they are tested.
 */

/* Incoming arguments and the returned value.  len shares x0 with
   srcin, so srcin is dead as soon as len is first written.  */
#define srcin		x0
#define len		x0
#define limit		x1

/* Working registers.  */
#define cursor		x2	/* Aligned read pointer, post-incremented.  */
#define dword_lo	x3	/* First 8 bytes of the current chunk.  */
#define dword_hi	x4	/* Second 8 bytes of the current chunk.  */
#define dword_hi_masked	x5	/* dword_hi with leading bytes forced to 0xff.  */
#define syn_lo		x6	/* NUL syndrome computed from dword_lo.  */
#define syn_hi		x7	/* NUL syndrome computed from dword_hi.  */
#define scratch1	x8
#define scratch2	x9
#define scratch3	x10
#define scratch4	x11
#define rep_ones	x12	/* Holds REP8_01 for the whole function.  */
#define nul_bit		x13	/* Bit index of the NUL marker.  */
#define qwords_left	x14	/* 16-byte chunks still to check, minus one.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

	.text
	.p2align 6
.Lstart:
	/* Seven NOPs of padding, placed so that the hot loop below is
	   meant to begin on an icache line boundary.  */
	.rep 7
	nop
	.endr
	/* The "limit reached" exit lives in front of the entry point so
	   that it fills space which would otherwise be more padding.  */
.Lreturn_limit:
	mov	len, limit
	ret

ENTRY(strnlen)
	cbz	limit, .Lreturn_limit		/* limit == 0: result is 0.  */
	mov	rep_ones, #REP8_01
	bic	cursor, srcin, #15		/* Round down to 16 bytes.  */
	ands	scratch1, srcin, #15		/* Offset inside that chunk.  */
	b.ne	.Lunaligned_start
	/* Aligned start: chunk count minus one is (limit - 1) / 16.  */
	sub	qwords_left, limit, #1		/* limit != 0, cannot wrap.  */
	lsr	qwords_left, qwords_left, #4	/* Bytes -> 16-byte chunks.  */

	/* Zero-byte test: (X - 1) & ~X & 0x80, which is the same as
	   (X - 1) & ~(X | 0x7f), is non-zero iff the byte X is zero.
	   Applied with replicated constants it tests all eight bytes of
	   a register at once.  */
	/* Each iteration handles two 8-byte words.  Start-up is a little
	   more expensive, but the two syndrome computations are
	   independent of each other and can be issued in parallel.  */

	/* Start of critical section -- keep to one 64-byte cache line.  */
.Lscan_qword:
	ldp	dword_lo, dword_hi, [cursor], #16
.Lcheck_qword:
	sub	scratch1, dword_lo, rep_ones
	orr	scratch2, dword_lo, #REP8_7f
	sub	scratch3, dword_hi, rep_ones
	orr	scratch4, dword_hi, #REP8_7f
	bic	syn_lo, scratch1, scratch2
	bic	syn_hi, scratch3, scratch4
	subs	qwords_left, qwords_left, #1	/* Goes negative on the last chunk.  */
	orr	scratch1, syn_lo, syn_hi
	/* Chunks remaining (PL): compare the merged syndrome with zero.
	   Otherwise force NZCV = 0000, i.e. "not equal", to leave the loop.  */
	ccmp	scratch1, #0, #0, pl
	b.eq	.Lscan_qword
	/* End of critical section -- keep to one 64-byte cache line.  */

	orr	scratch1, syn_lo, syn_hi
	cbz	scratch1, .Lreturn_limit	/* Last chunk holds no NUL.  */

	/* The chunk just examined contains a NUL.  Compute the distance
	   from srcin to that NUL and return MIN (distance, limit).  */

	sub	len, cursor, srcin		/* Offset just past the chunk.  */
	cbz	syn_lo, .Lnul_in_hi
#ifdef __AARCH64EB__
	mov	dword_hi, dword_lo
#endif
	sub	len, len, #8			/* NUL is in the first word.  */
	mov	syn_hi, syn_lo
.Lnul_in_hi:
#ifdef __AARCH64EB__
	/* Big-endian: the borrow in the subtraction can travel towards
	   earlier string bytes (when the byte before the NUL is 0x01),
	   so the syndrome may flag the wrong byte.  Byte-swap the data
	   and compute the syndrome again from the swapped value.  */
	rev	dword_hi, dword_hi
	sub	scratch1, dword_hi, rep_ones
	orr	scratch2, dword_hi, #REP8_7f
	bic	syn_hi, scratch1, scratch2
#endif
	sub	len, len, #8			/* Offset of the word holding the NUL.  */
	rev	syn_hi, syn_hi			/* First string byte to the top.  */
	clz	nul_bit, syn_hi
	add	len, len, nul_bit, lsr #3	/* Bit index -> byte index.  */
	cmp	len, limit
	csel	len, len, limit, ls		/* Unsigned minimum.  */
	ret

.Lunaligned_start:
	/* srcin is not 16-byte aligned; scratch1 holds srcin & 15.
	   Two independent jobs are interleaved here:
	   1) Work out the chunk count minus one, (limit + scratch1 - 1) / 16,
	      without overflowing when limit is close to ULONG_MAX.  The
	      low four bits and the remaining bits of limit - 1 are
	      handled separately so that no 65-bit sum is ever formed.
	   2) Load the first chunk and set every byte in front of srcin
	      to 0xff so that those bytes can never look like a NUL.  */
	sub	qwords_left, limit, #1
	neg	scratch4, scratch1
	cmp	scratch1, #8			/* Flags are consumed by csinv/csel.  */

	and	scratch3, qwords_left, #15	/* (limit - 1) % 16.  */
	lsr	qwords_left, qwords_left, #4	/* (limit - 1) / 16.  */
	mov	scratch2, #~0

	ldp	dword_lo, dword_hi, [cursor], #16
	lsl	scratch4, scratch4, #3		/* Negated byte offset -> bits.  */
	add	scratch3, scratch3, scratch1

#ifdef __AARCH64EB__
	/* Big-endian: the earliest bytes sit in the most significant
	   bits.  Only the low six bits of scratch4 are used as the
	   shift amount.  */
	lsl	scratch2, scratch2, scratch4
#else
	/* Little-endian: the earliest bytes sit in the least significant
	   bits.  Only the low six bits of scratch4 are used as the
	   shift amount.  */
	lsr	scratch2, scratch2, scratch4
#endif
	add	qwords_left, qwords_left, scratch3, lsr #4	/* Carry of the low parts.  */

	orr	dword_lo, dword_lo, scratch2
	orr	dword_hi_masked, dword_hi, scratch2

	/* Offset <= 8: only the first word has bytes to hide, so keep
	   the masked first word and the untouched second word.
	   Offset > 8: the whole first word precedes srcin (set it to all
	   ones) and the second word takes the mask instead.  */
	csinv	dword_lo, dword_lo, xzr, le
	csel	dword_hi, dword_hi, dword_hi_masked, le
	b	.Lcheck_qword
END(strnlen)