/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#include <private/bionic_asm.h>

/* Register aliases used by strnlen.  Only x0-x14 are named here.  */

/* Arguments and results.
   srcin and len are both x0: the incoming pointer and the returned
   length share one register, so srcin must not be read after len has
   been written.  */
#define srcin		x0
#define len		x0
#define limit		x1

/* Locals and temporaries.  */
#define src		x2	/* Read pointer, advanced 16 bytes per load.  */
#define data1		x3	/* First Dword of the current 16-byte chunk.  */
#define data2		x4	/* Second Dword of the current 16-byte chunk.  */
#define data2a		x5	/* data2 with its leading bytes forced to 0xff.  */
#define has_nul1	x6	/* NUL syndrome for data1 (non-zero => NUL seen).  */
#define has_nul2	x7	/* NUL syndrome for data2 (non-zero => NUL seen).  */
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define tmp4		x11
#define zeroones	x12	/* Holds REP8_01.  */
#define pos		x13	/* Leading-zero bit count locating the NUL.  */
#define limit_wd	x14	/* Number of 16-byte chunks to examine, minus 1.  */

/* Byte patterns replicated across all eight bytes of a Dword.
   REP8_01 and REP8_7f feed the NUL-detection expression; REP8_80 is
   defined alongside them but is not referenced by strnlen.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* size_t strnlen (const char *s, size_t maxlen)
 *
 * In:       x0 = s (srcin), x1 = maxlen (limit)
 * Out:      x0 = MIN (offset of the first NUL byte in s, maxlen)
 * Clobbers: x2-x14 and NZCV.  Leaf routine; the stack is not used.
 *
 * The string is read with 16-byte loads starting at s rounded down to a
 * 16-byte boundary.  Bytes in front of s, and bytes behind the
 * terminator or the limit inside the last chunk, are therefore loaded
 * but never affect the result.
 */
	.text
	.p2align	6
.Lstart:
	/* Pre-padding intended to make the critical loop begin on an
	   icache line boundary.  */
	.rep 7
	nop
	.endr
	/* Put this code here to avoid wasting more space with pre-padding.
	   Reached when the limit is zero, or when the limit is exhausted
	   without a NUL being seen: the result is the limit itself.  */
.Lhit_limit:
	mov	len, limit
	ret

ENTRY(strnlen)
	cbz	limit, .Lhit_limit
	mov	zeroones, #REP8_01
	bic	src, srcin, #15		/* Round s down to a 16-byte boundary.  */
	ands	tmp1, srcin, #15	/* tmp1 = offset of s inside that Qword.  */
	b.ne	.Lmisaligned
	/* Calculate the number of full and partial words -1.  */
	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow.  */
	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */

	/* Start of critical section -- keep to one 64Byte cache line.  */
.Lloop:
	ldp	data1, data2, [src], #16
.Lrealigned:
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	subs	limit_wd, limit_wd, #1	/* N set once the last chunk is done.  */
	orr	tmp1, has_nul1, has_nul2
	/* If chunks remain (pl), test the combined syndrome against zero;
	   otherwise force Z clear so that the loop exits.  */
	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
	b.eq	.Lloop
	/* End of critical section -- keep to one 64Byte cache line.  */

	/* tmp1 still holds has_nul1 | has_nul2 from the final iteration
	   (ccmp and b.eq write no general register), so it is not
	   recomputed here.  Zero means the loop stopped on the limit.  */
	cbz	tmp1, .Lhit_limit	/* No null in final Qword.  */

	/* We know there's a null in the final Qword.  The easiest thing
	   to do now is work out the length of the string and return
	   MIN (len, limit).  */

	/* src points just past the chunk that was examined, so this is
	   the distance from s to the end of that chunk.  This is the last
	   read of srcin: len overwrites the same register.  */
	sub	len, src, srcin
	cbz	has_nul1, .Lnul_in_data2
	/* The NUL is in data1: step back over data2 and continue with
	   data1's syndrome (and, for big-endian, data1's bytes).  */
#ifdef __AARCH64EB__
	mov	data2, data1
#endif
	sub	len, len, #8
	mov	has_nul2, has_nul1
.Lnul_in_data2:
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	rev	data2, data2
	sub	tmp1, data2, zeroones
	orr	tmp2, data2, #REP8_7f
	bic	has_nul2, tmp1, tmp2
#endif
	sub	len, len, #8		/* Back to the start of the NUL's Dword.  */
	/* Byte-reverse so that the marker for the lowest-addressed NUL
	   becomes the most significant set bit, then count down to it.  */
	rev	has_nul2, has_nul2
	clz	pos, has_nul2
	add	len, len, pos, lsr #3		/* Bits to bytes.  */
	cmp	len, limit
	csel	len, len, limit, ls		/* Return the lower value.  */
	ret

.Lmisaligned:
	/* Deal with a partial first word.
	   We're doing two things in parallel here;
	   1) Calculate the number of words (but avoiding overflow if
	      limit is near ULONG_MAX) - to do this we need to work out
	      limit + tmp1 - 1 as a 65-bit value before shifting it;
	   2) Load and mask the initial data words - we force the bytes
	      before the ones we are interested in to 0xff - this ensures
	      early bytes will not hit any zero detection.  */
	sub	limit_wd, limit, #1
	neg	tmp4, tmp1
	/* These flags are consumed by the csinv/csel at the end of this
	   block; no instruction in between modifies NZCV.  */
	cmp	tmp1, #8

	and	tmp3, limit_wd, #15	/* Low four bits of limit - 1.  */
	lsr	limit_wd, limit_wd, #4	/* (limit - 1) / 16.  */
	mov	tmp2, #~0

	ldp	data1, data2, [src], #16
	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
	add	tmp3, tmp3, tmp1	/* ((limit - 1) & 15) + offset of s.  */

	/* The register shift uses only the low six bits of tmp4, so the
	   effective amount is (-8 * tmp1) & 63.  */
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp4	/* Shift (tmp4 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp4	/* Shift (tmp4 & 63).  */
#endif
	/* Add the carry out of the low four bits to finish the 65-bit
	   (limit + tmp1 - 1) / 16.  */
	add	limit_wd, limit_wd, tmp3, lsr #4

	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2

	/* Offset <= 8: s starts in data1, so keep the masked data1 and
	   the untouched data2.  Offset > 8: s starts in data2, so data1
	   becomes all ones and data2 takes its masked copy.  */
	csinv	data1, data1, xzr, le
	csel	data2, data2, data2a, le
	b	.Lrealigned
END(strnlen)
