1/* Copyright (c) 2013-2015, Linaro Limited
2   All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6       * Redistributions of source code must retain the above copyright
7	 notice, this list of conditions and the following disclaimer.
8       * Redistributions in binary form must reproduce the above copyright
9	 notice, this list of conditions and the following disclaimer in the
10	 documentation and/or other materials provided with the distribution.
11       * Neither the name of the Linaro nor the
12	 names of its contributors may be used to endorse or promote products
13	 derived from this software without specific prior written permission.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26
27/* Assumptions:
28 *
29 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
30 */
31
32#include <private/bionic_asm.h>
33
34/* To test the page crossing code path more thoroughly, compile with
35   -DTEST_PAGE_CROSS - this will force all calls through the slower
36   entry path.  This option is not intended for production use.	 */
37
38/* Arguments and results.  */
39#define srcin		x0
40#define len		x0
41
42/* Locals and temporaries.  */
43#define src		x1
44#define data1		x2
45#define data2		x3
46#define has_nul1	x4
47#define has_nul2	x5
48#define tmp1		x4
49#define tmp2		x5
50#define tmp3		x6
51#define tmp4		x7
52#define zeroones	x8
53
54#define L(l) .L ## l
55
56	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
57	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
58	   can be done in parallel across the entire word. A faster check
59	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
60	   false hits for characters 129..255.	*/
61
62#define REP8_01 0x0101010101010101
63#define REP8_7f 0x7f7f7f7f7f7f7f7f
64#define REP8_80 0x8080808080808080
65
66#ifdef TEST_PAGE_CROSS
67# define MIN_PAGE_SIZE 15
68#else
69# define MIN_PAGE_SIZE 4096
70#endif
71
72	/* Since strings are short on average, we check the first 16 bytes
73	   of the string for a NUL character.  In order to do an unaligned ldp
74	   safely we have to do a page cross check first.  If there is a NUL
75	   byte we calculate the length from the 2 8-byte words using
76	   conditional select to reduce branch mispredictions (it is unlikely
77	   strlen will be repeatedly called on strings with the same length).
78
79	   If the string is longer than 16 bytes, we align src so don't need
80	   further page cross checks, and process 32 bytes per iteration
81	   using the fast NUL check.  If we encounter non-ASCII characters,
82	   fallback to a second loop using the full NUL check.
83
84	   If the page cross check fails, we read 16 bytes from an aligned
85	   address, remove any characters before the string, and continue
86	   in the main loop using aligned loads.  Since strings crossing a
87	   page in the first 16 bytes are rare (probability of
88	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
89
90	   AArch64 systems have a minimum page size of 4k.  We don't bother
91	   checking for larger page sizes - the cost of setting up the correct
92	   page size is just not worth the extra gain from a small reduction in
93	   the cases taking the slow path.  Note that we only care about
94	   whether the first fetch, which may be misaligned, crosses a page
95	   boundary.  */
96
97ENTRY(strlen)
98	and	tmp1, srcin, MIN_PAGE_SIZE - 1
99	mov	zeroones, REP8_01
100	cmp	tmp1, MIN_PAGE_SIZE - 16
101	b.gt	L(page_cross)
102	ldp	data1, data2, [srcin]
103#ifdef __AARCH64EB__
104	/* For big-endian, carry propagation (if the final byte in the
105	   string is 0x01) means we cannot use has_nul1/2 directly.
106	   Since we expect strings to be small and early-exit,
107	   byte-swap the data now so has_null1/2 will be correct.  */
108	rev	data1, data1
109	rev	data2, data2
110#endif
111	sub	tmp1, data1, zeroones
112	orr	tmp2, data1, REP8_7f
113	sub	tmp3, data2, zeroones
114	orr	tmp4, data2, REP8_7f
115	bics	has_nul1, tmp1, tmp2
116	bic	has_nul2, tmp3, tmp4
117	ccmp	has_nul2, 0, 0, eq
118	beq	L(main_loop_entry)
119
120	/* Enter with C = has_nul1 == 0.  */
121	csel	has_nul1, has_nul1, has_nul2, cc
122	mov	len, 8
123	rev	has_nul1, has_nul1
124	clz	tmp1, has_nul1
125	csel	len, xzr, len, cc
126	add	len, len, tmp1, lsr 3
127	ret
128
129	/* The inner loop processes 32 bytes per iteration and uses the fast
130	   NUL check.  If we encounter non-ASCII characters, use a second
131	   loop with the accurate NUL check.  */
132	.p2align 4
133L(main_loop_entry):
134	bic	src, srcin, 15
135	sub	src, src, 16
136L(main_loop):
137	ldp	data1, data2, [src, 32]!
138.Lpage_cross_entry:
139	sub	tmp1, data1, zeroones
140	sub	tmp3, data2, zeroones
141	orr	tmp2, tmp1, tmp3
142	tst	tmp2, zeroones, lsl 7
143	bne	1f
144	ldp	data1, data2, [src, 16]
145	sub	tmp1, data1, zeroones
146	sub	tmp3, data2, zeroones
147	orr	tmp2, tmp1, tmp3
148	tst	tmp2, zeroones, lsl 7
149	beq	L(main_loop)
150	add	src, src, 16
1511:
152	/* The fast check failed, so do the slower, accurate NUL check.	 */
153	orr	tmp2, data1, REP8_7f
154	orr	tmp4, data2, REP8_7f
155	bics	has_nul1, tmp1, tmp2
156	bic	has_nul2, tmp3, tmp4
157	ccmp	has_nul2, 0, 0, eq
158	beq	L(nonascii_loop)
159
160	/* Enter with C = has_nul1 == 0.  */
161L(tail):
162#ifdef __AARCH64EB__
163	/* For big-endian, carry propagation (if the final byte in the
164	   string is 0x01) means we cannot use has_nul1/2 directly.  The
165	   easiest way to get the correct byte is to byte-swap the data
166	   and calculate the syndrome a second time.  */
167	csel	data1, data1, data2, cc
168	rev	data1, data1
169	sub	tmp1, data1, zeroones
170	orr	tmp2, data1, REP8_7f
171	bic	has_nul1, tmp1, tmp2
172#else
173	csel	has_nul1, has_nul1, has_nul2, cc
174#endif
175	sub	len, src, srcin
176	rev	has_nul1, has_nul1
177	add	tmp2, len, 8
178	clz	tmp1, has_nul1
179	csel	len, len, tmp2, cc
180	add	len, len, tmp1, lsr 3
181	ret
182
183L(nonascii_loop):
184	ldp	data1, data2, [src, 16]!
185	sub	tmp1, data1, zeroones
186	orr	tmp2, data1, REP8_7f
187	sub	tmp3, data2, zeroones
188	orr	tmp4, data2, REP8_7f
189	bics	has_nul1, tmp1, tmp2
190	bic	has_nul2, tmp3, tmp4
191	ccmp	has_nul2, 0, 0, eq
192	bne	L(tail)
193	ldp	data1, data2, [src, 16]!
194	sub	tmp1, data1, zeroones
195	orr	tmp2, data1, REP8_7f
196	sub	tmp3, data2, zeroones
197	orr	tmp4, data2, REP8_7f
198	bics	has_nul1, tmp1, tmp2
199	bic	has_nul2, tmp3, tmp4
200	ccmp	has_nul2, 0, 0, eq
201	beq	L(nonascii_loop)
202	b	L(tail)
203
204	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
205	   srcin to 0x7f, so we ignore any NUL bytes before the string.
206	   Then continue in the aligned loop.  */
207L(page_cross):
208	bic	src, srcin, 15
209	ldp	data1, data2, [src]
210	lsl	tmp1, srcin, 3
211	mov	tmp4, -1
212#ifdef __AARCH64EB__
213	/* Big-endian.	Early bytes are at MSB.	 */
214	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
215#else
216	/* Little-endian.  Early bytes are at LSB.  */
217	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
218#endif
219	orr	tmp1, tmp1, REP8_80
220	orn	data1, data1, tmp1
221	orn	tmp2, data2, tmp1
222	tst	srcin, 8
223	csel	data1, data1, tmp4, eq
224	csel	data2, data2, tmp2, eq
225	b	L(page_cross_entry)
226
227END(strlen)
228