1/*
2Copyright (c) 2011 Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* Assembler-local label: L(foo) expands to .Lfoo.  */
#if !defined (L)
# define L(lbl)	.L##lbl
#endif

/* Thin wrappers around the assembler's .cfi_* directives.  Each one is
   only defined here when nothing earlier has already defined it.  */
#if !defined (cfi_startproc)
# define cfi_startproc	.cfi_startproc
#endif

#if !defined (cfi_endproc)
# define cfi_endproc	.cfi_endproc
#endif

#if !defined (cfi_rel_offset)
# define cfi_rel_offset(r, disp)	.cfi_rel_offset r, disp
#endif

#if !defined (cfi_restore)
# define cfi_restore(r)	.cfi_restore r
#endif

#if !defined (cfi_adjust_cfa_offset)
# define cfi_adjust_cfa_offset(delta)	.cfi_adjust_cfa_offset delta
#endif

/* ENTRY (sym): declare SYM as a global function symbol, align it to
   16 bytes, emit its label and open its CFI region.  */
#if !defined (ENTRY)
# define ENTRY(sym)	\
	.type sym, @function;	\
	.globl sym;	\
	.p2align 4;	\
sym:	\
	cfi_startproc
#endif

/* END (sym): close the CFI region and record the size of SYM.  */
#if !defined (END)
# define END(sym)	\
	cfi_endproc;	\
	.size sym, .-sym
#endif

/* Unwind bookkeeping for one 4-byte push: the CFA offset grows by 4 and
   the register is recorded as saved at the new top of stack.  */
#define CFI_PUSH(reg)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (reg, 0)

/* Unwind bookkeeping for one 4-byte pop: the inverse of CFI_PUSH.  */
#define CFI_POP(reg)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (reg)

/* A real push/pop instruction together with its unwind annotation.  */
#define PUSH(reg)	\
	pushl reg;	\
	CFI_PUSH (reg)
#define POP(reg)	\
	popl reg;	\
	CFI_POP (reg)

/* Stack offsets of the two arguments as seen after ENTRANCE has pushed
   %edi: PARMS is the offset of the first one, the second follows it.  */
#define PARMS  8
#define STR1  PARMS
#define STR2  STR1+4

/* Function prologue: save %edi.  */
#define ENTRANCE	\
	PUSH (%edi);

/* Function epilogue: restore %edi and return.  The trailing CFI_PUSH puts
   the unwind state back to "%edi pushed" for whatever code follows the
   RETURN in the instruction stream.  */
#define RETURN	\
	POP (%edi);	\
	ret;	\
	CFI_PUSH (%edi);
87
88	.text
/*
 * wcsrchr -- i386, SSE2.
 * Standard C prototype: wchar_t *wcsrchr (const wchar_t *s, wchar_t c);
 * elements are compared here as 32-bit values (pcmpeqd).
 *
 * In:    STR1(%esp) = s, STR2(%esp) = c (offsets valid after ENTRANCE).
 * Out:   %eax = address of the last element equal to c that is not past
 *        the first zero element, or 0 if there is none.  When c is 0 the
 *        comparison against c fires on the terminator itself, so the
 *        address of the terminator is returned.
 * Saved: %edi (always), %esi (only on paths that enter L(loop)).
 * Clobb: %ecx, %edx, %xmm0-%xmm5, flags.
 *
 * Register roles:
 *   %xmm1  c replicated into all four 32-bit lanes.
 *   %xmm2  zero on entry to every block compare; it receives the
 *          "element == 0" result and scanning only continues while that
 *          result is all-zero, so it is zero again for the next block.
 *   %edi   scan pointer; it is advanced by 16 right after each load, so at
 *          the decision labels it points 16 bytes past the block whose
 *          masks are being examined.
 *   %eax   pmovmskb mask of "element == c" for the current block.
 *   %ecx   pmovmskb mask of "element == 0" (or the OR of both masks).
 *   %edx   saved match mask of the most recent block that held a match and
 *          no terminator; 0 while no match has been seen.
 *   %esi   the %edi value that goes with %edx.
 *
 * pmovmskb yields one bit per byte, so every element owns four adjacent
 * mask bits: bits 0-3 first element, 4-7 second, 8-11 third, 12-15 fourth.
 */
ENTRY (wcsrchr)

	ENTRANCE
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1

	mov	%ecx, %edi
/* Two self-unpacks spread the low dword of xmm1 over all four lanes.  */
	punpckldq %xmm1, %xmm1
	pxor	%xmm2, %xmm2
	punpckldq %xmm1, %xmm1

/* ECX has OFFSET: the position of s inside its 64-byte line.  With an
   offset above 48 a 16-byte load from s would run past that line, so
   those strings take the aligned-load path at L(crosscache).  */
	and	$63, %ecx
	cmp	$48, %ecx
	ja	L(crosscache)

/* Unaligned string: check the first 16 bytes with an unaligned load.  */
	movdqu	(%edi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
/* Find where NULL is.  */
	pmovmskb %xmm2, %ecx
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	add	$16, %edi

	test	%eax, %eax
	jnz	L(unaligned_match1)

/* No match in the first 16 bytes; a terminator there means "not found".  */
	test	%ecx, %ecx
	jnz	L(return_null)

/* Round down to the first 16-byte boundary after s.  The aligned loop may
   look again at bytes already checked above.  */
	and	$-16, %edi

	PUSH	(%esi)

/* No match recorded so far.  */
	xor	%edx, %edx
	jmp	L(loop)

	CFI_POP	(%esi)

	.p2align 4
L(unaligned_match1):
/* Match in the first 16 bytes.  If the terminator is there too, the answer
   is decided inside this block.  */
	test	%ecx, %ecx
	jnz	L(prolog_find_zero_1)

	PUSH	(%esi)

/* Save current match: edx = mask, esi = s + 16.  */
	mov	%eax, %edx
	mov	%edi, %esi
	and	$-16, %edi
	jmp	L(loop)

	CFI_POP	(%esi)

	.p2align 4
L(crosscache):
/* Handle unaligned string: load the aligned 16-byte block that contains s
   and discard the mask bits of the bytes in front of s.  The zero compare
   goes through xmm3, so xmm2 stays zero for the loop.  */
	and	$15, %ecx
	and	$-16, %edi
	pxor	%xmm3, %xmm3
	movdqa	(%edi), %xmm0
	pcmpeqd	%xmm0, %xmm3
	pcmpeqd	%xmm1, %xmm0
/* Find where NULL is.  */
	pmovmskb %xmm3, %edx
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
/* Remove the leading bytes: after the shifts bit 0 corresponds to s.  */
	shr	%cl, %edx
	shr	%cl, %eax
	add	$16, %edi

	test	%eax, %eax
	jnz	L(unaligned_match)

	test	%edx, %edx
	jnz	L(return_null)

	PUSH	(%esi)

/* No match recorded so far.  */
	xor	%edx, %edx
	jmp	L(loop)

	CFI_POP	(%esi)

	.p2align 4
L(unaligned_match):
/* Match in the first (partial) block; edx still holds the shifted
   terminator mask here.  */
	test	%edx, %edx
	jnz	L(prolog_find_zero)

	PUSH	(%esi)

/* Save current match.  The mask was shifted by ecx bits, so the matching
   pointer is moved forward by ecx bytes as well (esi = s + 16).  */
	mov	%eax, %edx
	lea	(%edi, %ecx), %esi

/* Loop start on aligned string.  Four 16-byte blocks per iteration; the
   loop is left as soon as a block holds a match or a terminator.  */
	.p2align 4
L(loop):
	movdqa	(%edi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm3
	pcmpeqd	%xmm3, %xmm2
	add	$16, %edi
	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm3, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm4
	pcmpeqd	%xmm4, %xmm2
	add	$16, %edi
	pcmpeqd	%xmm1, %xmm4
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm4, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm5
	pcmpeqd	%xmm5, %xmm2
	add	$16, %edi
	pcmpeqd	%xmm1, %xmm5
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm5, %eax
	or	%eax, %ecx
	jz	L(loop)

	.p2align 4
L(matches):
/* ecx is the OR of both masks at this point.  eax == 0 therefore means the
   block holds a terminator but no match.  */
	test	%eax, %eax
	jnz	L(match)
L(return_value):
/* The string ends without a usable match in the current block: fall back
   to the saved match, if there is one.  */
	test	%edx, %edx
	jz	L(return_null_1)
	mov	%edx, %eax
	mov	%esi, %edi

	POP	(%esi)

/* Pick the highest matching element of the saved block.  */
	test	%ah, %ah
	jnz	L(match_third_or_fourth_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(return_null_1):
	POP	(%esi)

	xor	%eax, %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(match):
/* Re-read the terminator mask (ecx was overwritten by the OR above).  */
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(find_zero)
/* save match info */
	mov	%eax, %edx
	mov	%edi, %esi
	jmp	L(loop)

	.p2align 4
L(find_zero):
/* The block holds both a match and a terminator.  Locate the first zero
   element and keep only the match bits up to and including the lowest bit
   of that element.  */
	test	%cl, %cl
	jz	L(find_zero_in_third_or_fourth_wchar)
	test	$15, %cl
	jz	L(find_zero_in_second_wchar)
/* Terminator is the first element.  */
	and	$1, %eax
	jz	L(return_value)

	POP	(%esi)

	lea	-16(%edi), %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(find_zero_in_second_wchar):
	and	$(1 << 5) - 1, %eax
	jz	L(return_value)

	POP	(%esi)

	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(find_zero_in_third_or_fourth_wchar):
	test	$15, %ch
	jz	L(find_zero_in_fourth_wchar)
	and	$(1 << 9) - 1, %eax
	jz	L(return_value)

	POP	(%esi)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(find_zero_in_fourth_wchar):
/* Terminator is the last element of the block, so every match bit in eax
   is usable as it is (eax is non-zero on this path).  */

	POP	(%esi)

	test	%ah, %ah
	jnz	L(match_third_or_fourth_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

/* No CFI_PUSH (%esi) here: every label from this point to END is entered
   with %esi already popped or never pushed, i.e. with only %edi on the
   stack, which is exactly the unwind state RETURN leaves behind.  */

	.p2align 4
L(match_second_wchar):
	lea	-12(%edi), %eax
	RETURN

	.p2align 4
L(match_third_or_fourth_wchar):
	test	$15 << 4, %ah
	jnz	L(match_fourth_wchar)
	lea	-8(%edi), %eax
	RETURN

	.p2align 4
L(match_third_wchar):
	lea	-8(%edi), %eax
	RETURN

	.p2align 4
L(match_fourth_wchar):
	lea	-4(%edi), %eax
	RETURN

	.p2align 4
L(return_null):
	xor	%eax, %eax
	RETURN

/* Prolog variants of the find_zero logic: match and terminator both lie in
   the first 16 bytes examined.  No earlier match exists and %esi was not
   pushed, so "nothing usable" means returning 0.
   L(prolog_find_zero) is entered from the crosscache path with the masks
   shifted by ecx bits; it moves edi forward by ecx bytes to compensate and
   moves the terminator mask into ecx.  */
	.p2align 4
L(prolog_find_zero):
	add	%ecx, %edi
	mov	%edx, %ecx
L(prolog_find_zero_1):
	test	%cl, %cl
	jz	L(prolog_find_zero_in_third_or_fourth_wchar)
	test	$15, %cl
	jz	L(prolog_find_zero_in_second_wchar)
/* Terminator is the first element.  */
	and	$1, %eax
	jz	L(return_null)

	lea	-16(%edi), %eax
	RETURN

	.p2align 4
L(prolog_find_zero_in_second_wchar):
	and	$(1 << 5) - 1, %eax
	jz	L(return_null)

	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	.p2align 4
L(prolog_find_zero_in_third_or_fourth_wchar):
	test	$15, %ch
	jz	L(prolog_find_zero_in_fourth_wchar)
	and	$(1 << 9) - 1, %eax
	jz	L(return_null)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	.p2align 4
L(prolog_find_zero_in_fourth_wchar):
/* eax is non-zero on this path, so one of the tests below or the
   fall-through selects a real match.  */
	test	%ah, %ah
	jnz	L(match_third_or_fourth_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

END (wcsrchr)
403