1/*
2Copyright (c) 2011, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#ifndef L
32# define L(label)	.L##label
33#endif
34
35#ifndef cfi_startproc
36# define cfi_startproc	.cfi_startproc
37#endif
38
39#ifndef cfi_endproc
40# define cfi_endproc	.cfi_endproc
41#endif
42
43#ifndef cfi_rel_offset
44# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
45#endif
46
47#ifndef cfi_restore
48# define cfi_restore(reg)	.cfi_restore reg
49#endif
50
51#ifndef cfi_adjust_cfa_offset
52# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
53#endif
54
55#ifndef ENTRY
56# define ENTRY(name)	\
57	.type name,  @function;	\
58	.globl name;	\
59	.p2align 4;	\
60name:	\
61	cfi_startproc
62#endif
63
64#ifndef END
65# define END(name)	\
66	cfi_endproc;	\
67	.size name,	.-name
68#endif
69
70#define CFI_PUSH(REG)	\
71	cfi_adjust_cfa_offset (4);	\
72	cfi_rel_offset (REG, 0)
73
74#define CFI_POP(REG)	\
75	cfi_adjust_cfa_offset (-4);	\
76	cfi_restore (REG)
77
78#define PUSH(REG) pushl REG; CFI_PUSH (REG)
79#define POP(REG) popl REG; CFI_POP (REG)
80
81#define ENTRANCE PUSH (%edi);
82#define PARMS  8
83#define RETURN  POP (%edi); ret; CFI_PUSH (%edi);
84
85#define STR1  PARMS
86#define STR2  STR1+4
87#define LEN   STR2+4
88
89	.text
90ENTRY (memchr)
91	ENTRANCE
92	mov	STR1(%esp), %ecx
93	movd	STR2(%esp), %xmm1
94	mov	LEN(%esp), %edx
95	test	%edx, %edx
96	jz	L(return_null)
97
98	punpcklbw %xmm1, %xmm1
99	mov	%ecx, %edi
100	punpcklbw %xmm1, %xmm1
101
102	and	$63, %ecx
103	pshufd	$0, %xmm1, %xmm1
104	cmp	$48, %ecx
105	ja	L(crosscache)
106
107	movdqu	(%edi), %xmm0
108	pcmpeqb	%xmm1, %xmm0
109	pmovmskb %xmm0, %eax
110	test	%eax, %eax
111	jnz	L(match_case2_prolog)
112
113	sub	$16, %edx
114	jbe	L(return_null)
115	lea	16(%edi), %edi
116	and	$15, %ecx
117	and	$-16, %edi
118	add	%ecx, %edx
119	sub	$64, %edx
120	jbe	L(exit_loop)
121	jmp	L(loop_prolog)
122
123	.p2align 4
124L(crosscache):
125	and	$15, %ecx
126	and	$-16, %edi
127	movdqa	(%edi), %xmm0
128	pcmpeqb	%xmm1, %xmm0
129	pmovmskb %xmm0, %eax
130	sar	%cl, %eax
131	test	%eax, %eax
132
133	jnz	L(match_case2_prolog1)
134	lea	-16(%edx), %edx
135	add	%ecx, %edx
136	jle	L(return_null)
137	lea	16(%edi), %edi
138	sub	$64, %edx
139	jbe	L(exit_loop)
140
141	.p2align 4
142L(loop_prolog):
143	movdqa	(%edi), %xmm0
144	pcmpeqb	%xmm1, %xmm0
145	xor	%ecx, %ecx
146	pmovmskb %xmm0, %eax
147	test	%eax, %eax
148	jnz	L(match_case1)
149
150	movdqa	16(%edi), %xmm2
151	pcmpeqb	%xmm1, %xmm2
152	lea	16(%ecx), %ecx
153	pmovmskb %xmm2, %eax
154	test	%eax, %eax
155	jnz	L(match_case1)
156
157	movdqa	32(%edi), %xmm3
158	pcmpeqb	%xmm1, %xmm3
159	lea	16(%ecx), %ecx
160	pmovmskb %xmm3, %eax
161	test	%eax, %eax
162	jnz	L(match_case1)
163
164	movdqa	48(%edi), %xmm4
165	pcmpeqb	%xmm1, %xmm4
166	lea	16(%ecx), %ecx
167	pmovmskb %xmm4, %eax
168	test	%eax, %eax
169	jnz	L(match_case1)
170
171	lea	64(%edi), %edi
172	sub	$64, %edx
173	jbe	L(exit_loop)
174
175	movdqa	(%edi), %xmm0
176	pcmpeqb	%xmm1, %xmm0
177	xor	%ecx, %ecx
178	pmovmskb %xmm0, %eax
179	test	%eax, %eax
180	jnz	L(match_case1)
181
182	movdqa	16(%edi), %xmm2
183	pcmpeqb	%xmm1, %xmm2
184	lea	16(%ecx), %ecx
185	pmovmskb %xmm2, %eax
186	test	%eax, %eax
187	jnz	L(match_case1)
188
189	movdqa	32(%edi), %xmm3
190	pcmpeqb	%xmm1, %xmm3
191	lea	16(%ecx), %ecx
192	pmovmskb %xmm3, %eax
193	test	%eax, %eax
194	jnz	L(match_case1)
195
196	movdqa	48(%edi), %xmm4
197	pcmpeqb	%xmm1, %xmm4
198	lea	16(%ecx), %ecx
199	pmovmskb %xmm4, %eax
200	test	%eax, %eax
201	jnz	L(match_case1)
202
203	lea	64(%edi), %edi
204	mov	%edi, %ecx
205	and	$-64, %edi
206	and	$63, %ecx
207	add	%ecx, %edx
208
209	.p2align 4
210L(align64_loop):
211	sub	$64, %edx
212	jbe	L(exit_loop)
213	movdqa	(%edi), %xmm0
214	movdqa	16(%edi), %xmm2
215	movdqa	32(%edi), %xmm3
216	movdqa	48(%edi), %xmm4
217	pcmpeqb	%xmm1, %xmm0
218	pcmpeqb	%xmm1, %xmm2
219	pcmpeqb	%xmm1, %xmm3
220	pcmpeqb	%xmm1, %xmm4
221
222	pmaxub	%xmm0, %xmm3
223	pmaxub	%xmm2, %xmm4
224	pmaxub	%xmm3, %xmm4
225	add	$64, %edi
226	pmovmskb %xmm4, %eax
227
228	test	%eax, %eax
229	jz	L(align64_loop)
230
231	sub	$64, %edi
232
233	pmovmskb %xmm0, %eax
234	xor	%ecx, %ecx
235	test	%eax, %eax
236	jnz	L(match_case1)
237
238	pmovmskb %xmm2, %eax
239	lea	16(%ecx), %ecx
240	test	%eax, %eax
241	jnz	L(match_case1)
242
243	movdqa	32(%edi), %xmm3
244	pcmpeqb	%xmm1, %xmm3
245	pmovmskb %xmm3, %eax
246	lea	16(%ecx), %ecx
247	test	%eax, %eax
248	jnz	L(match_case1)
249
250	pcmpeqb	48(%edi), %xmm1
251	pmovmskb %xmm1, %eax
252	lea	16(%ecx), %ecx
253
254	.p2align 4
255L(match_case1):
256	add	%ecx, %edi
257	test	%al, %al
258	jz	L(match_case1_high)
259	mov	%al, %cl
260	and	$15, %cl
261	jz	L(match_case1_8)
262	test	$0x01, %al
263	jnz	L(exit_case1_1)
264	test	$0x02, %al
265	jnz	L(exit_case1_2)
266	test	$0x04, %al
267	jnz	L(exit_case1_3)
268	lea	3(%edi), %eax
269	RETURN
270
271	.p2align 4
272L(match_case1_8):
273	test	$0x10, %al
274	jnz	L(exit_case1_5)
275	test	$0x20, %al
276	jnz	L(exit_case1_6)
277	test	$0x40, %al
278	jnz	L(exit_case1_7)
279	lea	7(%edi), %eax
280	RETURN
281
282	.p2align 4
283L(match_case1_high):
284	mov	%ah, %ch
285	and	$15, %ch
286	jz	L(match_case1_high_8)
287	test	$0x01, %ah
288	jnz	L(exit_case1_9)
289	test	$0x02, %ah
290	jnz	L(exit_case1_10)
291	test	$0x04, %ah
292	jnz	L(exit_case1_11)
293	lea	11(%edi), %eax
294	RETURN
295
296	.p2align 4
297L(match_case1_high_8):
298	test	$0x10, %ah
299	jnz	L(exit_case1_13)
300	test	$0x20, %ah
301	jnz	L(exit_case1_14)
302	test	$0x40, %ah
303	jnz	L(exit_case1_15)
304	lea	15(%edi), %eax
305	RETURN
306
307	.p2align 4
308L(exit_loop):
309	add	$64, %edx
310
311	movdqa	(%edi), %xmm0
312	pcmpeqb	%xmm1, %xmm0
313	xor	%ecx, %ecx
314	pmovmskb %xmm0, %eax
315	test	%eax, %eax
316	jnz	L(match_case2)
317	cmp	$16, %edx
318	jbe	L(return_null)
319
320	movdqa	16(%edi), %xmm2
321	pcmpeqb	%xmm1, %xmm2
322	lea	16(%ecx), %ecx
323	pmovmskb %xmm2, %eax
324	test	%eax, %eax
325	jnz	L(match_case2)
326	cmp	$32, %edx
327	jbe	L(return_null)
328
329	movdqa	32(%edi), %xmm3
330	pcmpeqb	%xmm1, %xmm3
331	lea	16(%ecx), %ecx
332	pmovmskb %xmm3, %eax
333	test	%eax, %eax
334	jnz	L(match_case2)
335	cmp	$48, %edx
336	jbe	L(return_null)
337
338	pcmpeqb	48(%edi), %xmm1
339	lea	16(%ecx), %ecx
340	pmovmskb %xmm1, %eax
341	test	%eax, %eax
342	jnz	L(match_case2)
343
344	xor	%eax, %eax
345	RETURN
346
347	.p2align 4
348L(exit_case1_1):
349	mov	%edi, %eax
350	RETURN
351
352	.p2align 4
353L(exit_case1_2):
354	lea	1(%edi), %eax
355	RETURN
356
357	.p2align 4
358L(exit_case1_3):
359	lea	2(%edi), %eax
360	RETURN
361
362	.p2align 4
363L(exit_case1_5):
364	lea	4(%edi), %eax
365	RETURN
366
367	.p2align 4
368L(exit_case1_6):
369	lea	5(%edi), %eax
370	RETURN
371
372	.p2align 4
373L(exit_case1_7):
374	lea	6(%edi), %eax
375	RETURN
376
377	.p2align 4
378L(exit_case1_9):
379	lea	8(%edi), %eax
380	RETURN
381
382	.p2align 4
383L(exit_case1_10):
384	lea	9(%edi), %eax
385	RETURN
386
387	.p2align 4
388L(exit_case1_11):
389	lea	10(%edi), %eax
390	RETURN
391
392	.p2align 4
393L(exit_case1_13):
394	lea	12(%edi), %eax
395	RETURN
396
397	.p2align 4
398L(exit_case1_14):
399	lea	13(%edi), %eax
400	RETURN
401
402	.p2align 4
403L(exit_case1_15):
404	lea	14(%edi), %eax
405	RETURN
406
407	.p2align 4
408L(match_case2):
409	sub	%ecx, %edx
410L(match_case2_prolog1):
411	add	%ecx, %edi
412L(match_case2_prolog):
413	test	%al, %al
414	jz	L(match_case2_high)
415	mov	%al, %cl
416	and	$15, %cl
417	jz	L(match_case2_8)
418	test	$0x01, %al
419	jnz	L(exit_case2_1)
420	test	$0x02, %al
421	jnz	L(exit_case2_2)
422	test	$0x04, %al
423	jnz	L(exit_case2_3)
424	sub	$4, %edx
425	jb	L(return_null)
426	lea	3(%edi), %eax
427	RETURN
428
429	.p2align 4
430L(match_case2_8):
431	test	$0x10, %al
432	jnz	L(exit_case2_5)
433	test	$0x20, %al
434	jnz	L(exit_case2_6)
435	test	$0x40, %al
436	jnz	L(exit_case2_7)
437	sub	$8, %edx
438	jb	L(return_null)
439	lea	7(%edi), %eax
440	RETURN
441
442	.p2align 4
443L(match_case2_high):
444	mov	%ah, %ch
445	and	$15, %ch
446	jz	L(match_case2_high_8)
447	test	$0x01, %ah
448	jnz	L(exit_case2_9)
449	test	$0x02, %ah
450	jnz	L(exit_case2_10)
451	test	$0x04, %ah
452	jnz	L(exit_case2_11)
453	sub	$12, %edx
454	jb	L(return_null)
455	lea	11(%edi), %eax
456	RETURN
457
458	.p2align 4
459L(match_case2_high_8):
460	test	$0x10, %ah
461	jnz	L(exit_case2_13)
462	test	$0x20, %ah
463	jnz	L(exit_case2_14)
464	test	$0x40, %ah
465	jnz	L(exit_case2_15)
466	sub	$16, %edx
467	jb	L(return_null)
468	lea	15(%edi), %eax
469	RETURN
470
471	.p2align 4
472L(exit_case2_1):
473	mov	%edi, %eax
474	RETURN
475
476	.p2align 4
477L(exit_case2_2):
478	sub	$2, %edx
479	jb	L(return_null)
480	lea	1(%edi), %eax
481	RETURN
482
483	.p2align 4
484L(exit_case2_3):
485	sub	$3, %edx
486	jb	L(return_null)
487	lea	2(%edi), %eax
488	RETURN
489
490	.p2align 4
491L(exit_case2_5):
492	sub	$5, %edx
493	jb	L(return_null)
494	lea	4(%edi), %eax
495	RETURN
496
497	.p2align 4
498L(exit_case2_6):
499	sub	$6, %edx
500	jb	L(return_null)
501	lea	5(%edi), %eax
502	RETURN
503
504	.p2align 4
505L(exit_case2_7):
506	sub	$7, %edx
507	jb	L(return_null)
508	lea	6(%edi), %eax
509	RETURN
510
511	.p2align 4
512L(exit_case2_9):
513	sub	$9, %edx
514	jb	L(return_null)
515	lea	8(%edi), %eax
516	RETURN
517
518	.p2align 4
519L(exit_case2_10):
520	sub	$10, %edx
521	jb	L(return_null)
522	lea	9(%edi), %eax
523	RETURN
524
525	.p2align 4
526L(exit_case2_11):
527	sub	$11, %edx
528	jb	L(return_null)
529	lea	10(%edi), %eax
530	RETURN
531
532	.p2align 4
533L(exit_case2_13):
534	sub	$13, %edx
535	jb	L(return_null)
536	lea	12(%edi), %eax
537	RETURN
538
539	.p2align 4
540L(exit_case2_14):
541	sub	$14, %edx
542	jb	L(return_null)
543	lea	13(%edi), %eax
544	RETURN
545
546	.p2align 4
547L(exit_case2_15):
548	sub	$15, %edx
549	jb	L(return_null)
550	lea	14(%edi), %eax
551	RETURN
552	.p2align 4
553L(return_null):
554	xor	%eax, %eax
555	RETURN
556END (memchr)
557