memchr.S revision 8c20c13100d159ff505af9e6e19cab30f368a074
1/*
2 * memchr - find a character in a memory zone
3 *
4 * Copyright (c) 2014, ARM Limited
5 * All rights Reserved.
6 * Copyright (c) 2014, Linaro Ltd.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *     * Redistributions of source code must retain the above copyright
11 *       notice, this list of conditions and the following disclaimer.
12 *     * Redistributions in binary form must reproduce the above copyright
13 *       notice, this list of conditions and the following disclaimer in the
14 *       documentation and/or other materials provided with the distribution.
15 *     * Neither the name of the company nor the names of its contributors
16 *       may be used to endorse or promote products derived from this
17 *       software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/* Assumptions:
33 *
34 * ARMv8-a, AArch64
35 * Neon Available.
36 */
37
38#include <private/bionic_asm.h>
39
40/* Arguments and results.  */
41#define srcin		x0
42#define chrin		w1
43#define cntin		x2
44
45#define result		x0
46
47#define src		x3
48#define	tmp		x4
49#define wtmp2		w5
50#define synd		x6
51#define soff		x9
52#define cntrem		x10
53
54#define vrepchr		v0
55#define vdata1		v1
56#define vdata2		v2
57#define vhas_chr1	v3
58#define vhas_chr2	v4
59#define vrepmask	v5
60#define vend		v6
61
62/*
63 * Core algorithm:
64 *
65 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
66 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
67 * requested character and bit 1 is not used (faster than using a 32bit
68 * syndrome). Since the bits in the syndrome reflect exactly the order in which
69 * things occur in the original string, counting trailing zeros allows to
70 * identify exactly which byte has matched.
71 */
72
73ENTRY(memchr)
74	/*
75	 * Magic constant 0x40100401 allows us to identify which lane matches
76	 * the requested byte.
77	 */
78	mov	wtmp2, #0x0401
79	movk	wtmp2, #0x4010, lsl #16
80	dup	vrepchr.16b, chrin
81	/* Work with aligned 32-byte chunks */
82	bic	src, srcin, #31
83	dup	vrepmask.4s, wtmp2
84	ands	soff, srcin, #31
85	and	cntrem, cntin, #31
86	b.eq	.Lloop
87
88	/*
89	 * Input string is not 32-byte aligned. We calculate the syndrome
90	 * value for the aligned 32 bytes block containing the first bytes
91	 * and mask the irrelevant part.
92	 */
93
94	ld1	{vdata1.16b, vdata2.16b}, [src], #32
95	sub	tmp, soff, #32
96	adds	cntin, cntin, tmp
97	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
98	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
99	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
100	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
101	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
102	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
103	mov	synd, vend.2d[0]
104	/* Clear the soff*2 lower bits */
105	lsl	tmp, soff, #1
106	lsr	synd, synd, tmp
107	lsl	synd, synd, tmp
108	/* The first block can also be the last */
109	b.ls	.Lmasklast
110	/* Have we found something already? */
111	cbnz	synd, .Ltail
112
113.Lloop:
114	ld1	{vdata1.16b, vdata2.16b}, [src], #32
115	subs	cntin, cntin, #32
116	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
117	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
118	/* If we're out of data we finish regardless of the result */
119	b.ls	.Lend
120	/* Use a fast check for the termination condition */
121	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
122	addp	vend.2d, vend.2d, vend.2d
123	mov	synd, vend.2d[0]
124	/* We're not out of data, loop if we haven't found the character */
125	cbz	synd, .Lloop
126
127.Lend:
128	/* Termination condition found, let's calculate the syndrome value */
129	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
130	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
131	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
132	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
133	mov	synd, vend.2d[0]
134	/* Only do the clear for the last possible block */
135	b.hi	.Ltail
136
137.Lmasklast:
138	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
139	add	tmp, cntrem, soff
140	and	tmp, tmp, #31
141	sub	tmp, tmp, #32
142	neg	tmp, tmp, lsl #1
143	lsl	synd, synd, tmp
144	lsr	synd, synd, tmp
145
146.Ltail:
147	/* Count the trailing zeros using bit reversing */
148	rbit	synd, synd
149	/* Compensate the last post-increment */
150	sub	src, src, #32
151	/* Check that we have found a character */
152	cmp	synd, #0
153	/* And count the leading zeros */
154	clz	synd, synd
155	/* Compute the potential result */
156	add	result, src, synd, lsr #1
157	/* Select result or NULL */
158	csel	result, xzr, result, eq
159	ret
160END(memchr)
161