/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Name of the symbol this file defines.  It defaults to strlen; a build
   may predefine STRLEN to emit the same code under another name.  */
#ifndef STRLEN
# define STRLEN strlen
#endif

/* L(foo) expands to .Lfoo.  The .L prefix marks an assembler-local label
   that is not exported from the object file.  */
#ifndef L
# define L(label)	.L##label
#endif

/* Thin wrappers around the GNU as .cfi_* directives.  Each one is only
   defined here when the including environment has not already supplied
   its own definition.  */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY (name): open a function.  Marks NAME as a global symbol of type
   function, aligns it to 16 bytes (.p2align 4), emits the label and
   starts a CFI region.  */
#ifndef ENTRY
# define ENTRY(name)             \
	.type name,  @function;  \
	.globl name;             \
	.p2align 4;              \
name:                            \
	cfi_startproc
#endif

/* END (name): close the CFI region opened by ENTRY and record the size
   of NAME as the distance from its label to this point.  */
#ifndef END
# define END(name)               \
	cfi_endproc;             \
	.size name,	.-name
#endif

/* Unwind bookkeeping for a 4-byte push: the CFA offset grows by 4 and REG
   is recorded as saved at offset 0 from the current CFA register.  */
#define CFI_PUSH(REG)                   \
	cfi_adjust_cfa_offset (4);      \
	cfi_rel_offset (REG, 0)

/* Unwind bookkeeping for the matching 4-byte pop: the CFA offset shrinks
   by 4 and REG is marked as restored.  */
#define CFI_POP(REG)                    \
	cfi_adjust_cfa_offset (-4);     \
	cfi_restore (REG)

/* Push / pop a 32-bit register and keep the CFI state in step with the
   stack.  Invoke as PUSH (%reg) / POP (%reg).  */
#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)

/*
 * STRLEN (defaults to strlen) -- SSE2 string length, 32-bit x86, AT&T syntax.
 *
 * In:    4(%esp) = s, pointer to the first byte to scan (first stack
 *                  argument).
 * Out:   %eax    = offset from s of the first zero byte.
 * Uses:  %ecx, %edx, %xmm0-%xmm5 and the flags.  %edi is used briefly on
 *        one path and is saved/restored around that use.
 *
 * Register roles inside the routine:
 *   %edx          s, never modified after the initial load.
 *   %eax          16-byte aligned scan cursor; turned into the result in
 *                 the exit stubs.
 *   %ecx          byte mask produced by pmovmskb (bit i set = byte i is 0).
 *   %xmm0-%xmm3   compare operands.  Each holds zero before it is used;
 *                 pcmpeqb overwrites it with the compare result, which is
 *                 again all zero whenever no zero byte was found, so the
 *                 register can be reused without being cleared.
 *
 * Phases:
 *   1. Check the first 16 bytes (unaligned load when that load stays
 *      inside the 64-byte block containing s, otherwise an aligned load
 *      with the bytes in front of s masked off).
 *   2. Check 16 aligned 16-byte chunks one at a time.
 *   3. Advance in 16-byte steps until the cursor is 64-byte aligned.
 *   4. Scan 64 bytes per iteration, then locate the zero byte inside the
 *      64-byte block that stopped the loop.
 */
	.section .text.sse2,"ax",@progbits
ENTRY (STRLEN)
	mov	4(%esp), %edx		/* edx = s */
	mov	%edx, %ecx
	and	$0x3f, %ecx		/* ecx = s mod 64 */
	pxor	%xmm0, %xmm0
	cmp	$0x30, %ecx
	/* With s mod 64 > 48 a 16-byte load at s would run past the end of
	   the 64-byte block containing s; use the aligned path instead.  */
	ja	L(next)
	movdqu	(%edx), %xmm1		/* unaligned load of s[0..15] */
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %ecx
	test	%ecx, %ecx
	jnz	L(exit_less16)
	mov	%edx, %eax
	and	$-16, %eax		/* eax = s rounded down to 16 */
	jmp	L(align16_start)
L(next):
	mov	%edx, %eax
	and	$-16, %eax		/* eax = s rounded down to 16 */
	PUSH	(%edi)
	pcmpeqb	(%eax), %xmm0		/* compare the aligned chunk holding s */
	mov	$-1, %edi
	/* ecx = (s & 0x3f) - (s & -16).  A 32-bit shl only uses the low five
	   bits of %cl, and those equal s & 15, so edi = -1 << (s & 15).  */
	sub	%eax, %ecx
	shl	%cl, %edi
	pmovmskb %xmm0, %ecx
	and	%edi, %ecx		/* drop matches located before s */
	POP	(%edi)			/* popl does not modify the flags */
	jnz	L(exit_unaligned)	/* tests the result of the and above */
	pxor	%xmm0, %xmm0		/* masked-off matches may have left xmm0 non-zero */
L(align16_start):
	/* Everything below eax + 16 has been checked.  */
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	pcmpeqb	16(%eax), %xmm0
	pmovmskb %xmm0, %ecx
	test	%ecx, %ecx
	jnz	L(exit16)

	pcmpeqb	32(%eax), %xmm1
	pmovmskb %xmm1, %ecx
	test	%ecx, %ecx
	jnz	L(exit32)

	pcmpeqb	48(%eax), %xmm2
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(exit48)

	pcmpeqb	64(%eax), %xmm3
	pmovmskb %xmm3, %ecx
	test	%ecx, %ecx
	jnz	L(exit64)

	/* Second group of four chunks.  The cursor is advanced right after
	   the first compare so that the same exit stubs (relative to the
	   updated eax) can be reused.  */
	pcmpeqb	80(%eax), %xmm0
	add	$64, %eax
	pmovmskb %xmm0, %ecx
	test	%ecx, %ecx
	jnz	L(exit16)

	pcmpeqb	32(%eax), %xmm1
	pmovmskb %xmm1, %ecx
	test	%ecx, %ecx
	jnz	L(exit32)

	pcmpeqb	48(%eax), %xmm2
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(exit48)

	pcmpeqb	64(%eax), %xmm3
	pmovmskb %xmm3, %ecx
	test	%ecx, %ecx
	jnz	L(exit64)

	/* Third group of four chunks.  */
	pcmpeqb	80(%eax), %xmm0
	add	$64, %eax
	pmovmskb %xmm0, %ecx
	test	%ecx, %ecx
	jnz	L(exit16)

	pcmpeqb	32(%eax), %xmm1
	pmovmskb %xmm1, %ecx
	test	%ecx, %ecx
	jnz	L(exit32)

	pcmpeqb	48(%eax), %xmm2
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(exit48)

	pcmpeqb	64(%eax), %xmm3
	pmovmskb %xmm3, %ecx
	test	%ecx, %ecx
	jnz	L(exit64)

	/* Fourth group of four chunks.  */
	pcmpeqb	80(%eax), %xmm0
	add	$64, %eax
	pmovmskb %xmm0, %ecx
	test	%ecx, %ecx
	jnz	L(exit16)

	pcmpeqb	32(%eax), %xmm1
	pmovmskb %xmm1, %ecx
	test	%ecx, %ecx
	jnz	L(exit32)

	pcmpeqb	48(%eax), %xmm2
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(exit48)

	pcmpeqb	64(%eax), %xmm3
	pmovmskb %xmm3, %ecx
	test	%ecx, %ecx
	jnz	L(exit64)

	/* Everything below eax + 80 has been checked and eax is 16-byte
	   aligned.  Step in 16-byte chunks until eax is 64-byte aligned.
	   From here on eax points AT the chunk just compared, so a hit
	   leaves through L(exit), which adds no extra displacement.  */
	test	$0x3f, %eax
	jz	L(align64_loop)

	pcmpeqb	80(%eax), %xmm0
	add	$80, %eax
	pmovmskb %xmm0, %ecx
	test	%ecx, %ecx
	jnz	L(exit)

	test	$0x3f, %eax
	jz	L(align64_loop)

	pcmpeqb	16(%eax), %xmm1
	add	$16, %eax
	pmovmskb %xmm1, %ecx
	test	%ecx, %ecx
	jnz	L(exit)

	test	$0x3f, %eax
	jz	L(align64_loop)

	pcmpeqb	16(%eax), %xmm2
	add	$16, %eax
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(exit)

	/* Reaching this point means eax mod 64 was 16 when the alignment
	   sequence started; after +80, +16, +16 it is a multiple of 64, so
	   no further test or step is needed: fall through into the loop.  */
	.p2align 4
L(align64_loop):
	/* eax is 64-byte aligned and xmm0 is zero.  The first iteration may
	   re-read bytes that were already checked, which is harmless.  The
	   byte-wise minimum of the four chunks contains a zero byte exactly
	   when one of the 64 bytes is zero.  */
	movaps	(%eax),	%xmm4
	pminub	16(%eax), 	%xmm4
	movaps	32(%eax), 	%xmm5
	pminub	48(%eax), 	%xmm5
	add	$64, 	%eax
	pminub	%xmm4,	%xmm5
	pcmpeqb	%xmm0,	%xmm5
	pmovmskb %xmm5,	%ecx
	test	%ecx,	%ecx
	jz	L(align64_loop)

	/* The zero byte is in the 64-byte block at eax - 64.  Re-check its
	   four chunks individually.  After the sub below, eax is the block
	   start minus 16, which lets the exit16/32/48 stubs be reused.  */
	pcmpeqb	-64(%eax), %xmm0
	sub	$80, 	%eax
	pmovmskb %xmm0, %ecx
	test	%ecx, %ecx
	jnz	L(exit16)

	pcmpeqb	32(%eax), %xmm1
	pmovmskb %xmm1, %ecx
	test	%ecx, %ecx
	jnz	L(exit32)

	pcmpeqb	48(%eax), %xmm2
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(exit48)

	/* The loop only exits when one of the four chunks holds a zero byte
	   and the first three did not, so no test is needed here.  */
	pcmpeqb	64(%eax), %xmm3
	pmovmskb %xmm3, %ecx
	sub	%edx, %eax
	bsf	%ecx, %ecx
	add	%ecx, %eax
	add	$64, %eax
	ret

	/* Exit stubs.  Each computes
	       (eax - s) + index of the lowest set bit in ecx + displacement
	   where the displacement is the offset from eax of the chunk whose
	   compare produced ecx.  */

	.p2align 4
L(exit):				/* hit in the chunk at eax */
	sub	%edx, %eax
	bsf	%ecx, %ecx
	add	%ecx, %eax
	ret

L(exit_less16):				/* hit in the unaligned load at s */
	bsf	%ecx, %eax
	ret

	.p2align 4
L(exit_unaligned):			/* hit in the aligned chunk holding s */
	sub	%edx, %eax
	bsf	%ecx, %ecx
	add	%ecx, %eax
	ret

	.p2align 4
L(exit16):				/* hit in the chunk at eax + 16 */
	sub	%edx, %eax
	bsf	%ecx, %ecx
	add	%ecx, %eax
	add	$16, %eax
	ret

	.p2align 4
L(exit32):				/* hit in the chunk at eax + 32 */
	sub	%edx, %eax
	bsf	%ecx, %ecx
	add	%ecx, %eax
	add	$32, %eax
	ret

	.p2align 4
L(exit48):				/* hit in the chunk at eax + 48 */
	sub	%edx, %eax
	bsf	%ecx, %ecx
	add	%ecx, %eax
	add	$48, %eax
	ret

	.p2align 4
L(exit64):				/* hit in the chunk at eax + 64 */
	sub	%edx, %eax
	bsf	%ecx, %ecx
	add	%ecx, %eax
	add	$64, %eax
	ret

END (STRLEN)
