1/*
2Copyright (c) 2011, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* Helper macros.  Most of them are wrapped in #ifndef so that a
   definition supplied earlier (for example by an including file) takes
   precedence over the fallback given here.  */

/* L(label): spell a label with the .L prefix.  */
31#ifndef L
32# define L(label)	.L##label
33#endif
34
/* Thin wrappers that expand to the assembler's .cfi_* directives.  */
35#ifndef cfi_startproc
36# define cfi_startproc	.cfi_startproc
37#endif
38
39#ifndef cfi_endproc
40# define cfi_endproc	.cfi_endproc
41#endif
42
43#ifndef cfi_rel_offset
44# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
45#endif
46
47#ifndef cfi_restore
48# define cfi_restore(reg)	.cfi_restore reg
49#endif
50
51#ifndef cfi_adjust_cfa_offset
52# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
53#endif
54
/* ENTRY(name): open a function.  Marks `name' as a global symbol of
   type function, aligns it with .p2align 4, emits the label and starts
   the CFI region.  */
55#ifndef ENTRY
56# define ENTRY(name)	\
57	.type name,  @function;	\
58	.globl name;	\
59	.p2align 4;	\
60name:	\
61	cfi_startproc
62#endif
63
/* END(name): close the CFI region and record the size of `name' as the
   distance from its label to this point.  */
64#ifndef END
65# define END(name)	\
66	cfi_endproc;	\
67	.size name,	.-name
68#endif
69
/* CFI_PUSH(REG): describe a 4-byte push of REG to the unwinder: the CFA
   offset grows by 4 and REG is recorded as saved at offset 0.  Emits
   directives only, no instructions.  */
70#define CFI_PUSH(REG)	\
71	cfi_adjust_cfa_offset (4);	\
72	cfi_rel_offset (REG, 0)
73
/* CFI_POP(REG): the inverse description: the CFA offset shrinks by 4
   and REG is marked as restored.  */
74#define CFI_POP(REG)	\
75	cfi_adjust_cfa_offset (-4);	\
76	cfi_restore (REG)
77
/* PUSH/POP: the real pushl/popl followed by the matching CFI note.  */
78#define PUSH(REG) pushl REG; CFI_PUSH (REG)
79#define POP(REG) popl REG; CFI_POP (REG)
80
/* Offsets of the stack arguments relative to %esp at function entry.
   PARMS is the offset of the first argument; the slot at 0(%esp) is
   presumably the return address.  The definitions are unparenthesised
   sums and are only used in the form NAME(%esp).  */
81#define PARMS  4
82#define STR1  PARMS
83#define STR2  STR1+4
84#define LEN   STR2+4
85
/* memrchr: scan a buffer backwards for a byte value, using SSE2.

   Stack arguments (offsets from %esp on entry):
     STR1(%esp)  start address of the buffer
     STR2(%esp)  value to look for; only its low byte is compared
     LEN(%esp)   number of bytes in the buffer

   Returns in %eax the address of the highest-addressed matching byte
   inside the buffer, or 0 when there is none or the length is 0.

   Modifies %eax, %ecx, %edx, %xmm0-%xmm4 and the flags.  %edi is used
   only on the short-buffer path and is saved and restored there.  */
86	.text
87ENTRY (memrchr)
88	mov	STR1(%esp), %ecx
89	movd	STR2(%esp), %xmm1
90	mov	LEN(%esp), %edx
91
/* Zero length: nothing to find.  Otherwise %edx = length - 16, and
   buffers of at most 16 bytes take the separate short path.  */
92	test	%edx, %edx
93	jz	L(return_null)
94	sub	$16, %edx
95	jbe	L(length_less16)
96
/* Replicate the search byte into all 16 lanes of %xmm1 and test the
   last 16 bytes of the buffer with an unaligned load.  */
97	punpcklbw %xmm1, %xmm1
98	add	%edx, %ecx
99	punpcklbw %xmm1, %xmm1
100
101	movdqu	(%ecx), %xmm0
102	pshufd	$0, %xmm1, %xmm1
103	pcmpeqb	%xmm1, %xmm0
104
105	pmovmskb %xmm0, %eax
106	test	%eax, %eax
107	jnz	L(exit_dispatch)
108
/* No match in the tail.  Move down 64 bytes and, if that address is not
   16-byte aligned, round it up to the next boundary; the aligned block
   then overlaps bytes already tested.  On reaching loop_prolog %edx is
   the distance from the buffer start to %ecx + 64.  */
109	sub	$64, %ecx
110	mov	%ecx, %eax
111	and	$15, %eax
112	jz	L(loop_prolog)
113
114	add	$16, %ecx
115	add	$16, %edx
116	and	$-16, %ecx
117	sub	%eax, %edx
118
119	.p2align 4
120/* Loop start on aligned string.  */
121L(loop_prolog):
/* If 64 or fewer bytes remain, the block at %ecx may begin before the
   buffer, so leave for the bounds-checked tail code.  Otherwise test
   its four 16-byte chunks from highest to lowest address.  */
122	sub	$64, %edx
123	jbe	L(exit_loop)
124
125	movdqa	48(%ecx), %xmm0
126	pcmpeqb	%xmm1, %xmm0
127	pmovmskb %xmm0, %eax
128	test	%eax, %eax
129	jnz	L(matches48)
130
131	movdqa	32(%ecx), %xmm2
132	pcmpeqb	%xmm1, %xmm2
133	pmovmskb %xmm2, %eax
134	test	%eax, %eax
135	jnz	L(matches32)
136
137	movdqa	16(%ecx), %xmm3
138	pcmpeqb	%xmm1, %xmm3
139	pmovmskb %xmm3, %eax
140	test	%eax, %eax
141	jnz	L(matches16)
142
143	movdqa	(%ecx), %xmm4
144	pcmpeqb	%xmm1, %xmm4
145	pmovmskb %xmm4, %eax
146	test	%eax, %eax
147	jnz	L(exit_dispatch)
148
/* Same again for the next lower 64-byte block.  */
149	sub	$64, %ecx
150	sub	$64, %edx
151	jbe	L(exit_loop)
152
153	movdqa	48(%ecx), %xmm0
154	pcmpeqb	%xmm1, %xmm0
155	pmovmskb %xmm0, %eax
156	test	%eax, %eax
157	jnz	L(matches48)
158
159	movdqa	32(%ecx), %xmm2
160	pcmpeqb	%xmm1, %xmm2
161	pmovmskb %xmm2, %eax
162	test	%eax, %eax
163	jnz	L(matches32)
164
165	movdqa	16(%ecx), %xmm3
166	pcmpeqb	%xmm1, %xmm3
167	pmovmskb %xmm3, %eax
168	test	%eax, %eax
169	jnz	L(matches16)
170
171	movdqa	(%ecx), %xmm3
172	pcmpeqb	%xmm1, %xmm3
173	pmovmskb %xmm3, %eax
174	test	%eax, %eax
175	jnz	L(exit_dispatch)
176
/* Round %ecx up to a 64-byte boundary (skipped when already aligned)
   and adjust %edx by the same amount, so that the main loop below
   works on 64-byte aligned blocks.  */
177	mov	%ecx, %eax
178	and	$63, %eax
179	test	%eax, %eax
180	jz	L(align64_loop)
181
182	add	$64, %ecx
183	add	$64, %edx
184	and	$-64, %ecx
185	sub	%eax, %edx
186
187	.p2align 4
188L(align64_loop):
/* Main loop: step down one 64-byte block per iteration.  The four
   compare results are merged with pmaxub so that a single mask test
   decides whether the block contains any match.  */
189	sub	$64, %ecx
190	sub	$64, %edx
191	jbe	L(exit_loop)
192
193	movdqa	(%ecx), %xmm0
194	movdqa	16(%ecx), %xmm2
195	movdqa	32(%ecx), %xmm3
196	movdqa	48(%ecx), %xmm4
197
198	pcmpeqb	%xmm1, %xmm0
199	pcmpeqb	%xmm1, %xmm2
200	pcmpeqb	%xmm1, %xmm3
201	pcmpeqb	%xmm1, %xmm4
202
203	pmaxub	%xmm3, %xmm0
204	pmaxub	%xmm4, %xmm2
205	pmaxub	%xmm0, %xmm2
206	pmovmskb %xmm2, %eax
207
208	test	%eax, %eax
209	jz	L(align64_loop)
210
/* Some chunk matched; find the highest one.  %xmm4 and %xmm3 still hold
   the results for offsets 48 and 32.  %xmm2 and %xmm0 were overwritten
   by the merge, so the two lower chunks are compared again.  */
211	pmovmskb %xmm4, %eax
212	test	%eax, %eax
213	jnz	L(matches48)
214
215	pmovmskb %xmm3, %eax
216	test	%eax, %eax
217	jnz	L(matches32)
218
219	movdqa	16(%ecx), %xmm2
220
221	pcmpeqb	%xmm1, %xmm2
222	pcmpeqb	(%ecx), %xmm1
223
224	pmovmskb %xmm2, %eax
225	test	%eax, %eax
226	jnz	L(matches16)
227
/* The match is in the lowest chunk.  %xmm1 was used as the destination
   of the last compare; the pattern is not needed again because the
   search ends here.  What follows is an inline copy of the decode at
   exit_dispatch.  */
228	pmovmskb %xmm1, %eax
229	test	%ah, %ah
230	jnz	L(exit_dispatch_high)
231	mov	%al, %dl
232	and	$15 << 4, %dl
233	jnz	L(exit_dispatch_8)
234	test	$0x08, %al
235	jnz	L(exit_4)
236	test	$0x04, %al
237	jnz	L(exit_3)
238	test	$0x02, %al
239	jnz	L(exit_2)
240	mov	%ecx, %eax
241	ret
242
243	.p2align 4
244L(exit_loop):
/* Tail: after the add, %edx (1..64) is the number of buffer bytes in
   the block at %ecx, and they are its highest-addressed bytes.  Chunks
   that lie entirely inside the buffer use the plain matches* exits;
   a chunk that may extend below the buffer start uses the matches*_1
   exits, which reject hits in front of the buffer.  */
245	add	$64, %edx
246	cmp	$32, %edx
247	jbe	L(exit_loop_32)
248
249	movdqa	48(%ecx), %xmm0
250	pcmpeqb	%xmm1, %xmm0
251	pmovmskb %xmm0, %eax
252	test	%eax, %eax
253	jnz	L(matches48)
254
255	movdqa	32(%ecx), %xmm2
256	pcmpeqb	%xmm1, %xmm2
257	pmovmskb %xmm2, %eax
258	test	%eax, %eax
259	jnz	L(matches32)
260
261	movdqa	16(%ecx), %xmm3
262	pcmpeqb	%xmm1, %xmm3
263	pmovmskb %xmm3, %eax
264	test	%eax, %eax
265	jnz	L(matches16_1)
/* With 48 or fewer bytes left, the lowest chunk is wholly outside the
   buffer and is not read.  */
266	cmp	$48, %edx
267	jbe	L(return_null)
268
269	pcmpeqb	(%ecx), %xmm1
270	pmovmskb %xmm1, %eax
271	test	%eax, %eax
272	jnz	L(matches0_1)
273	xor	%eax, %eax
274	ret
275
276	.p2align 4
277L(exit_loop_32):
/* At most 32 bytes remain, so only the two upper chunks can hold buffer
   bytes, and either of them may begin before the buffer start.  */
278	movdqa	48(%ecx), %xmm0
279	pcmpeqb	%xmm1, %xmm0
280	pmovmskb %xmm0, %eax
281	test	%eax, %eax
282	jnz	L(matches48_1)
/* With 16 or fewer bytes left, the chunk at offset 32 is wholly outside
   the buffer and is not read.  */
283	cmp	$16, %edx
284	jbe	L(return_null)
285
286	pcmpeqb	32(%ecx), %xmm1
287	pmovmskb %xmm1, %eax
288	test	%eax, %eax
289	jnz	L(matches32_1)
290	xor	%eax, %eax
291	ret
292
/* matches16/32/48: a hit in the chunk at the given offset from %ecx.
   Rebase %ecx onto that chunk, then return the address of the highest
   set bit of the 16-bit mask in %eax.  The decode tests the high byte
   first, then the upper nibble of the low byte, then bits 3..1; bit 0
   is the only possibility left when everything else is clear.  %edx is
   not read again on these paths, so %dl and %dh serve as scratch.  */
293	.p2align 4
294L(matches16):
295	lea	16(%ecx), %ecx
296	test	%ah, %ah
297	jnz	L(exit_dispatch_high)
298	mov	%al, %dl
299	and	$15 << 4, %dl
300	jnz	L(exit_dispatch_8)
301	test	$0x08, %al
302	jnz	L(exit_4)
303	test	$0x04, %al
304	jnz	L(exit_3)
305	test	$0x02, %al
306	jnz	L(exit_2)
307	mov	%ecx, %eax
308	ret
309
310	.p2align 4
311L(matches32):
312	lea	32(%ecx), %ecx
313	test	%ah, %ah
314	jnz	L(exit_dispatch_high)
315	mov	%al, %dl
316	and	$15 << 4, %dl
317	jnz	L(exit_dispatch_8)
318	test	$0x08, %al
319	jnz	L(exit_4)
320	test	$0x04, %al
321	jnz	L(exit_3)
322	test	$0x02, %al
323	jnz	L(exit_2)
324	mov	%ecx, %eax
325	ret
326
327	.p2align 4
328L(matches48):
329	lea	48(%ecx), %ecx
330
/* exit_dispatch: entered with %ecx already addressing the matching
   chunk and a non-zero mask in %eax.  */
331	.p2align 4
332L(exit_dispatch):
333	test	%ah, %ah
334	jnz	L(exit_dispatch_high)
335	mov	%al, %dl
336	and	$15 << 4, %dl
337	jnz	L(exit_dispatch_8)
338	test	$0x08, %al
339	jnz	L(exit_4)
340	test	$0x04, %al
341	jnz	L(exit_3)
342	test	$0x02, %al
343	jnz	L(exit_2)
344	mov	%ecx, %eax
345	ret
346
/* The highest match is among bits 7..4 of the mask.  */
347	.p2align 4
348L(exit_dispatch_8):
349	test	$0x80, %al
350	jnz	L(exit_8)
351	test	$0x40, %al
352	jnz	L(exit_7)
353	test	$0x20, %al
354	jnz	L(exit_6)
355	lea	4(%ecx), %eax
356	ret
357
/* The highest match is among bits 15..8: split off bits 15..12, then
   decode bits 11..8 here.  */
358	.p2align 4
359L(exit_dispatch_high):
360	mov	%ah, %dh
361	and	$15 << 4, %dh
362	jnz	L(exit_dispatch_high_8)
363	test	$0x08, %ah
364	jnz	L(exit_12)
365	test	$0x04, %ah
366	jnz	L(exit_11)
367	test	$0x02, %ah
368	jnz	L(exit_10)
369	lea	8(%ecx), %eax
370	ret
371
/* The highest match is among bits 15..12.  */
372	.p2align 4
373L(exit_dispatch_high_8):
374	test	$0x80, %ah
375	jnz	L(exit_16)
376	test	$0x40, %ah
377	jnz	L(exit_15)
378	test	$0x20, %ah
379	jnz	L(exit_14)
380	lea	12(%ecx), %eax
381	ret
382
/* exit_N: return the address of byte N-1 of the chunk at %ecx.  */
383	.p2align 4
384L(exit_2):
385	lea	1(%ecx), %eax
386	ret
387
388	.p2align 4
389L(exit_3):
390	lea	2(%ecx), %eax
391	ret
392
393	.p2align 4
394L(exit_4):
395	lea	3(%ecx), %eax
396	ret
397
398	.p2align 4
399L(exit_6):
400	lea	5(%ecx), %eax
401	ret
402
403	.p2align 4
404L(exit_7):
405	lea	6(%ecx), %eax
406	ret
407
408	.p2align 4
409L(exit_8):
410	lea	7(%ecx), %eax
411	ret
412
413	.p2align 4
414L(exit_10):
415	lea	9(%ecx), %eax
416	ret
417
418	.p2align 4
419L(exit_11):
420	lea	10(%ecx), %eax
421	ret
422
423	.p2align 4
424L(exit_12):
425	lea	11(%ecx), %eax
426	ret
427
428	.p2align 4
429L(exit_14):
430	lea	13(%ecx), %eax
431	ret
432
433	.p2align 4
434L(exit_15):
435	lea	14(%ecx), %eax
436	ret
437
438	.p2align 4
439L(exit_16):
440	lea	15(%ecx), %eax
441	ret
442
/* Bounds-checked exits for the tail (matches0_1 .. matches48_1).
   On entry %edx is the number of buffer bytes left in the 64-byte
   block.  Subtracting the distance from the chunk start to the block
   end makes %edx + k negative exactly when byte k of the chunk lies
   before the buffer start.  Since the highest match is the one being
   examined, a negative sum means there is no match inside the buffer.
   %edx is live here, so %ah and %al are used as scratch for the nibble
   tests instead of %dl and %dh.  */
443	.p2align 4
444L(matches0_1):
445	lea	-64(%edx), %edx
446
447	test	%ah, %ah
448	jnz	L(exit_dispatch_1_high)
449	mov	%al, %ah
450	and	$15 << 4, %ah
451	jnz	L(exit_dispatch_1_8)
452	test	$0x08, %al
453	jnz	L(exit_1_4)
454	test	$0x04, %al
455	jnz	L(exit_1_3)
456	test	$0x02, %al
457	jnz	L(exit_1_2)
458
/* Byte 0 matched: adding 0 only sets the flags for the sign test.  */
459	add	$0, %edx
460	jl	L(return_null)
461	mov	%ecx, %eax
462	ret
463
464	.p2align 4
465L(matches16_1):
466	lea	-48(%edx), %edx
467	lea	16(%ecx), %ecx
468
469	test	%ah, %ah
470	jnz	L(exit_dispatch_1_high)
471	mov	%al, %ah
472	and	$15 << 4, %ah
473	jnz	L(exit_dispatch_1_8)
474	test	$0x08, %al
475	jnz	L(exit_1_4)
476	test	$0x04, %al
477	jnz	L(exit_1_3)
478	test	$0x02, %al
479	jnz	L(exit_1_2)
480
481	add	$0, %edx
482	jl	L(return_null)
483	mov	%ecx, %eax
484	ret
485
486	.p2align 4
487L(matches32_1):
488	lea	-32(%edx), %edx
489	lea	32(%ecx), %ecx
490
491	test	%ah, %ah
492	jnz	L(exit_dispatch_1_high)
493	mov	%al, %ah
494	and	$15 << 4, %ah
495	jnz	L(exit_dispatch_1_8)
496	test	$0x08, %al
497	jnz	L(exit_1_4)
498	test	$0x04, %al
499	jnz	L(exit_1_3)
500	test	$0x02, %al
501	jnz	L(exit_1_2)
502
503	add	$0, %edx
504	jl	L(return_null)
505	mov	%ecx, %eax
506	ret
507
508	.p2align 4
509L(matches48_1):
510	lea	-16(%edx), %edx
511	lea	48(%ecx), %ecx
512
513	.p2align 4
514L(exit_dispatch_1):
515	test	%ah, %ah
516	jnz	L(exit_dispatch_1_high)
517	mov	%al, %ah
518	and	$15 << 4, %ah
519	jnz	L(exit_dispatch_1_8)
520	test	$0x08, %al
521	jnz	L(exit_1_4)
522	test	$0x04, %al
523	jnz	L(exit_1_3)
524	test	$0x02, %al
525	jnz	L(exit_1_2)
526
527	add	$0, %edx
528	jl	L(return_null)
529	mov	%ecx, %eax
530	ret
531
/* Bounds-checked decode of bits 7..4; falls through to byte 4.  */
532	.p2align 4
533L(exit_dispatch_1_8):
534	test	$0x80, %al
535	jnz	L(exit_1_8)
536	test	$0x40, %al
537	jnz	L(exit_1_7)
538	test	$0x20, %al
539	jnz	L(exit_1_6)
540
541	add	$4, %edx
542	jl	L(return_null)
543	lea	4(%ecx), %eax
544	ret
545
/* Bounds-checked decode of bits 15..8.  %al is overwritten as scratch;
   the mask bits still needed are the ones in %ah.  */
546	.p2align 4
547L(exit_dispatch_1_high):
548	mov	%ah, %al
549	and	$15 << 4, %al
550	jnz	L(exit_dispatch_1_high_8)
551	test	$0x08, %ah
552	jnz	L(exit_1_12)
553	test	$0x04, %ah
554	jnz	L(exit_1_11)
555	test	$0x02, %ah
556	jnz	L(exit_1_10)
557
558	add	$8, %edx
559	jl	L(return_null)
560	lea	8(%ecx), %eax
561	ret
562
/* Bounds-checked decode of bits 15..12; falls through to byte 12.  */
563	.p2align 4
564L(exit_dispatch_1_high_8):
565	test	$0x80, %ah
566	jnz	L(exit_1_16)
567	test	$0x40, %ah
568	jnz	L(exit_1_15)
569	test	$0x20, %ah
570	jnz	L(exit_1_14)
571
572	add	$12, %edx
573	jl	L(return_null)
574	lea	12(%ecx), %eax
575	ret
576
/* exit_1_N: like exit_N, but byte N-1 of the chunk is returned only
   when %edx + (N-1) is not negative, i.e. the byte is inside the
   buffer; otherwise the result is null.  */
577	.p2align 4
578L(exit_1_2):
579	add	$1, %edx
580	jl	L(return_null)
581	lea	1(%ecx), %eax
582	ret
583
584	.p2align 4
585L(exit_1_3):
586	add	$2, %edx
587	jl	L(return_null)
588	lea	2(%ecx), %eax
589	ret
590
591	.p2align 4
592L(exit_1_4):
593	add	$3, %edx
594	jl	L(return_null)
595	lea	3(%ecx), %eax
596	ret
597
598	.p2align 4
599L(exit_1_6):
600	add	$5, %edx
601	jl	L(return_null)
602	lea	5(%ecx), %eax
603	ret
604
605	.p2align 4
606L(exit_1_7):
607	add	$6, %edx
608	jl	L(return_null)
609	lea	6(%ecx), %eax
610	ret
611
612	.p2align 4
613L(exit_1_8):
614	add	$7, %edx
615	jl	L(return_null)
616	lea	7(%ecx), %eax
617	ret
618
619	.p2align 4
620L(exit_1_10):
621	add	$9, %edx
622	jl	L(return_null)
623	lea	9(%ecx), %eax
624	ret
625
626	.p2align 4
627L(exit_1_11):
628	add	$10, %edx
629	jl	L(return_null)
630	lea	10(%ecx), %eax
631	ret
632
633	.p2align 4
634L(exit_1_12):
635	add	$11, %edx
636	jl	L(return_null)
637	lea	11(%ecx), %eax
638	ret
639
640	.p2align 4
641L(exit_1_14):
642	add	$13, %edx
643	jl	L(return_null)
644	lea	13(%ecx), %eax
645	ret
646
647	.p2align 4
648L(exit_1_15):
649	add	$14, %edx
650	jl	L(return_null)
651	lea	14(%ecx), %eax
652	ret
653
654	.p2align 4
655L(exit_1_16):
656	add	$15, %edx
657	jl	L(return_null)
658	lea	15(%ecx), %eax
659	ret
660
/* No match; used only by paths on which %edi has not been pushed.  */
661	.p2align 4
662L(return_null):
663	xor	%eax, %eax
664	ret
665
/* Short buffer (at most 16 bytes) that starts on a 16-byte boundary.
   On entry %eax = buffer start, %edx = length, %xmm1 = search pattern.
   One aligned compare is done and the mask is limited to its low
   "length" bits before the shared decode at exit_dispatch.  */
666	.p2align 4
667L(length_less16_offset0):
668	mov	%dl, %cl
669	pcmpeqb	(%eax), %xmm1
670
671	mov	$1, %edx
672	sal	%cl, %edx
673	sub	$1, %edx
674
675	mov	%eax, %ecx
676	pmovmskb %xmm1, %eax
677
678	and	%edx, %eax
679	test	%eax, %eax
680	jnz	L(exit_dispatch)
681
682	xor	%eax, %eax
683	ret
684
/* Buffers of 1..16 bytes.  The search byte is replicated into %xmm1,
   %edx is restored to the length, %eax keeps the start address and
   %ecx becomes the start's offset within its 16-byte chunk.  */
685	.p2align 4
686L(length_less16):
687	punpcklbw %xmm1, %xmm1
688	add	$16, %edx
689	punpcklbw %xmm1, %xmm1
690
691	mov	%ecx, %eax
692	pshufd	$0, %xmm1, %xmm1
693
694	and	$15, %ecx
695	jz	L(length_less16_offset0)
696
/* Unaligned start: %edi is used as an extra register from here on and
   must be restored on every exit below.  */
697	PUSH	(%edi)
698
/* %dh = offset + length - 16; a positive value is the number of buffer
   bytes that spill into the following 16-byte chunk.  %eax is rounded
   down to the chunk containing the buffer start.  */
699	mov	%cl, %dh
700	add	%dl, %dh
701	and	$-16, %eax
702
703	sub	$16, %dh
704	ja	L(length_less16_part2)
705
/* The buffer lies within one aligned chunk.  Shift the compare mask
   right by the offset so that bit 0 corresponds to the buffer start,
   keep its low "length" bits, and let bsr pick the highest match.
   %eax is turned back into the buffer start by adding the offset.  */
706	pcmpeqb	(%eax), %xmm1
707	pmovmskb %xmm1, %edi
708
709	sar	%cl, %edi
710	add	%ecx, %eax
711	mov	%dl, %cl
712
713	mov	$1, %edx
714	sal	%cl, %edx
715	sub	$1, %edx
716
717	and	%edx, %edi
718	test	%edi, %edi
719	jz	L(ret_null)
720
721	bsr	%edi, %edi
722	add	%edi, %eax
723	POP	(%edi)
724	ret
725
/* Emits CFI directives only and follows a ret.  It re-establishes the
   unwind description for the code below, which is reached by jumps
   taken while %edi is still pushed.  */
726	CFI_PUSH     (%edi)
727
/* The buffer straddles two aligned chunks.  Test the upper chunk first,
   keeping only its low %dh bits.  The offset is parked in %ch because
   %cl is needed as the shift count.  */
728	.p2align 4
729L(length_less16_part2):
730	movdqa	16(%eax), %xmm2
731	pcmpeqb	%xmm1, %xmm2
732	pmovmskb %xmm2, %edi
733
734	mov	%cl, %ch
735
736	mov	%dh, %cl
737	mov	$1, %edx
738	sal	%cl, %edx
739	sub	$1, %edx
740
741	and	%edx, %edi
742
743	test	%edi, %edi
744	jnz	L(length_less16_part2_return)
745
/* No match in the upper chunk.  In the lower chunk every byte from the
   offset upwards belongs to the buffer, so shifting the mask right by
   the offset is the only filtering required.  */
746	pcmpeqb	(%eax), %xmm1
747	pmovmskb %xmm1, %edi
748
749	mov	%ch, %cl
750	sar	%cl, %edi
751	test	%edi, %edi
752	jz	L(ret_null)
753
/* Result = chunk base + bit index + offset.  %ch is cleared first so
   that %ecx holds just the offset again.  */
754	bsr	%edi, %edi
755	add	%edi, %eax
756	xor	%ch, %ch
757	add	%ecx, %eax
758	POP	(%edi)
759	ret
760
/* CFI directives only, as above: the code below runs with %edi still
   pushed.  */
761	CFI_PUSH     (%edi)
762
/* Match in the upper chunk: result = chunk base + 16 + bit index.  */
763	.p2align 4
764L(length_less16_part2_return):
765	bsr	%edi, %edi
766	lea	16(%eax, %edi), %eax
767	POP	(%edi)
768	ret
769
/* CFI directives only, as above: the code below runs with %edi still
   pushed.  */
770	CFI_PUSH     (%edi)
771
/* No match on the unaligned short path; %edi is restored before the
   null result is returned.  */
772	.p2align 4
773L(ret_null):
774	xor	%eax, %eax
775	POP	(%edi)
776	ret
777
778END (memrchr)
779