1/*
2Copyright (c) 2011, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* Everything from here down to the matching #endif (which sits just after
   the function prologue) is skipped when USE_AS_STRCAT is defined: the
   helper macros, the function entry and the argument load.
   NOTE(review): presumably the including file supplies those itself in
   that configuration -- confirm against the includer.  */
31#ifndef USE_AS_STRCAT
32
/* Default exported symbol name; may be predefined to build under another
   name.  */
33# ifndef STRLEN
34#  define STRLEN strlen_atom
35# endif
36
/* L(name) pastes to .Lname, i.e. an assembler-local label.  */
37# ifndef L
38#  define L(label)	.L##label
39# endif
40
/* Thin wrappers around the assembler's CFI directives, each defined only
   if not already provided.  */
41# ifndef cfi_startproc
42#  define cfi_startproc	.cfi_startproc
43# endif
44
45# ifndef cfi_endproc
46#  define cfi_endproc	.cfi_endproc
47# endif
48
49/* A callee-saved register is only needed for strnlen, so the CFI helpers
   that describe saving and restoring a register are only defined for
   that variant.  */
50
51# ifdef USE_AS_STRNLEN
52#  ifndef cfi_rel_offset
53#   define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
54#  endif
55
56#  ifndef cfi_restore
57#   define cfi_restore(reg)	.cfi_restore reg
58#  endif
59
60#  ifndef cfi_adjust_cfa_offset
61#   define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
62#  endif
63# endif
64
/* ENTRY(name): mark name as a global function symbol, align it to
   2^4 = 16 bytes, emit the label and open the CFI region.  */
65# ifndef ENTRY
66#  define ENTRY(name)	\
67	.type name,  @function;	\
68	.globl name;	\
69	.p2align 4;	\
70name:	\
71	cfi_startproc
72# endif
73
/* END(name): close the CFI region and record the symbol size.  */
74# ifndef END
75#  define END(name)	\
76	cfi_endproc;	\
77	.size name, .-name
78# endif
79
/* STR is used as STR(%esp) at function entry, i.e. 4(%esp): the first
   stack argument, just above the return address in the i386 stack-based
   calling convention.  RETURN is a plain ret unless redefined below.  */
80# define PARMS	4
81# define STR	PARMS
82# define RETURN	ret
83
84# ifdef USE_AS_STRNLEN
/* LEN is used as LEN(%esp), which expands to 12(%esp).  It is only read
   after %edi has been pushed, so the extra 4 bytes over the entry-time
   offset of the second argument account for that push.  */
85#  define LEN	PARMS + 8
/* Unwind bookkeeping for a 4-byte push / pop of REG.  */
86#  define CFI_PUSH(REG)	\
87	cfi_adjust_cfa_offset (4);	\
88	cfi_rel_offset (REG, 0)
89
90#  define CFI_POP(REG)	\
91	cfi_adjust_cfa_offset (-4);	\
92	cfi_restore (REG)
93
94#  define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
95#  define POP(REG)	popl	REG;	CFI_POP (REG)
/* strnlen keeps its length budget in %edi, so every return must restore
   it first.  The CFI_PUSH after the ret re-asserts the "%edi is saved on
   the stack" unwind state for the code that textually follows each
   return site.  */
96#  undef RETURN
97#  define RETURN	POP (%edi); ret; CFI_PUSH(%edi);
98# endif
99
100	.text
/* STRLEN (strlen, or strnlen when USE_AS_STRNLEN is defined).
   Arguments are read from the stack: the string pointer from STR(%esp)
   and, for strnlen, the length limit from LEN(%esp) after %edi is pushed.
   The result is returned in %eax.
   Register roles in this part: edx = string pointer s, eax = result
   accumulator, edi = remaining length budget (strnlen only).  */
101ENTRY (STRLEN)
	/* edx = s, loaded before anything is pushed.  */
102	mov	STR(%esp), %edx
103# ifdef USE_AS_STRNLEN
	/* Save %edi and load the limit into it.  edi = limit - 4; a limit of
	   4 or less (borrow or zero result) goes to the bounded prolog.  */
104	PUSH	(%edi)
105	movl	LEN(%esp), %edi
106	sub	$4, %edi
107	jbe	L(len_less4_prolog)
108# endif
109#endif
	/* NOTE(review): with USE_AS_STRCAT everything above is compiled out,
	   so the code below reads %edx without loading it here -- presumably
	   the including file sets it up; confirm.  */
	/* eax = 0.  L(exit_tail0) returns %eax unchanged and L(exit_tailN)
	   adds N first, so a NUL found at byte N below yields N.  */
110	xor	%eax, %eax
	/* Bytes 0..3, checked one at a time.  */
111	cmpb	$0, (%edx)
112	jz	L(exit_tail0)
113	cmpb	$0, 1(%edx)
114	jz	L(exit_tail1)
115	cmpb	$0, 2(%edx)
116	jz	L(exit_tail2)
117	cmpb	$0, 3(%edx)
118	jz	L(exit_tail3)
119
120#ifdef USE_AS_STRNLEN
	/* edi = limit - 8; if at most 4 more bytes are allowed, finish in the
	   bounded prolog for bytes 4..7.  */
121	sub	$4, %edi
122	jbe	L(len_less8_prolog)
123#endif
124
	/* Bytes 4..7.  */
125	cmpb	$0, 4(%edx)
126	jz	L(exit_tail4)
127	cmpb	$0, 5(%edx)
128	jz	L(exit_tail5)
129	cmpb	$0, 6(%edx)
130	jz	L(exit_tail6)
131	cmpb	$0, 7(%edx)
132	jz	L(exit_tail7)
133
134#ifdef USE_AS_STRNLEN
	/* edi = limit - 12; same pattern for bytes 8..11.  */
135	sub	$4, %edi
136	jbe	L(len_less12_prolog)
137#endif
138
	/* Bytes 8..11.  */
139	cmpb	$0, 8(%edx)
140	jz	L(exit_tail8)
141	cmpb	$0, 9(%edx)
142	jz	L(exit_tail9)
143	cmpb	$0, 10(%edx)
144	jz	L(exit_tail10)
145	cmpb	$0, 11(%edx)
146	jz	L(exit_tail11)
147
148#ifdef USE_AS_STRNLEN
	/* edi = limit - 16; same pattern for bytes 12..15.  */
149	sub	$4, %edi
150	jbe	L(len_less16_prolog)
151#endif
152
	/* Bytes 12..15.  */
153	cmpb	$0, 12(%edx)
154	jz	L(exit_tail12)
155	cmpb	$0, 13(%edx)
156	jz	L(exit_tail13)
157	cmpb	$0, 14(%edx)
158	jz	L(exit_tail14)
159	cmpb	$0, 15(%edx)
160	jz	L(exit_tail15)
161
	/* No NUL in the first 16 bytes: set up the 16-bytes-at-a-time scan.
	   xmm0 = 0 (comparison value)
	   ecx  = s + 16, kept as the base that is subtracted at L(exit)
	   eax  = (s + 16) rounded down to a 16-byte boundary, the first
	          chunk address; it lies in (s, s + 16], so up to 15 bytes
	          that were already checked are scanned again.  */
162	pxor	%xmm0, %xmm0
163	lea	16(%edx), %eax
164	mov	%eax, %ecx
165	and	$-16, %eax
166
/* Unrolled scan of sixteen consecutive 16-byte chunks.
   State on entry (set up just above): xmm0 = 0, eax = first aligned chunk
   address, ecx = s + 16, edx = s, and for strnlen edi = limit - 16.
   Each probe compares the chunk at (%eax) against a zeroed register with
   pcmpeqb, extracts the per-byte result mask into %edx with pmovmskb,
   advances %eax by 16, and jumps to L(exit) if any bit is set.  At L(exit)
   %eax therefore points 16 bytes past the chunk holding the NUL.
   A register whose probe found no NUL is all-zero again afterwards (no
   byte compared equal), which is why xmm0..xmm3 are reused in later rounds
   without being cleared again.  */
167#ifdef USE_AS_STRNLEN
	/* Rebase the budget from s + 16 to the aligned chunk address: the
	   scan restarts (s & 15) bytes earlier, so that many bytes are added
	   back.  Then reserve 64 bytes for the next four probes; if the
	   budget does not exceed 64, finish in L(len_less64).  */
168	and	$15, %edx
169	add	%edx, %edi
170	sub	$64, %edi
171	jbe	L(len_less64)
172#endif
173
	/* Round 1: chunks 0..3.  xmm1..xmm3 are zeroed on first use.  */
174	pcmpeqb	(%eax), %xmm0
175	pmovmskb %xmm0, %edx
176	pxor	%xmm1, %xmm1
177	lea	16(%eax), %eax
178	test	%edx, %edx
179	jnz	L(exit)
180
181	pcmpeqb	(%eax), %xmm1
182	pmovmskb %xmm1, %edx
183	pxor	%xmm2, %xmm2
184	lea	16(%eax), %eax
185	test	%edx, %edx
186	jnz	L(exit)
187
188	pcmpeqb	(%eax), %xmm2
189	pmovmskb %xmm2, %edx
190	pxor	%xmm3, %xmm3
191	lea	16(%eax), %eax
192	test	%edx, %edx
193	jnz	L(exit)
194
195	pcmpeqb	(%eax), %xmm3
196	pmovmskb %xmm3, %edx
197	lea	16(%eax), %eax
198	test	%edx, %edx
199	jnz	L(exit)
200
201#ifdef USE_AS_STRNLEN
	/* Reserve the next 64 bytes of budget, as above.  */
202	sub	$64, %edi
203	jbe	L(len_less64)
204#endif
205
	/* Round 2: chunks 4..7.  */
206	pcmpeqb	(%eax), %xmm0
207	pmovmskb %xmm0, %edx
208	lea	16(%eax), %eax
209	test	%edx, %edx
210	jnz	L(exit)
211
212	pcmpeqb	(%eax), %xmm1
213	pmovmskb %xmm1, %edx
214	lea	16(%eax), %eax
215	test	%edx, %edx
216	jnz	L(exit)
217
218	pcmpeqb	(%eax), %xmm2
219	pmovmskb %xmm2, %edx
220	lea	16(%eax), %eax
221	test	%edx, %edx
222	jnz	L(exit)
223
224	pcmpeqb	(%eax), %xmm3
225	pmovmskb %xmm3, %edx
226	lea	16(%eax), %eax
227	test	%edx, %edx
228	jnz	L(exit)
229
230#ifdef USE_AS_STRNLEN
231	sub	$64, %edi
232	jbe	L(len_less64)
233#endif
234
	/* Round 3: chunks 8..11.  */
235	pcmpeqb	(%eax), %xmm0
236	pmovmskb %xmm0, %edx
237	lea	16(%eax), %eax
238	test	%edx, %edx
239	jnz	L(exit)
240
241	pcmpeqb	(%eax), %xmm1
242	pmovmskb %xmm1, %edx
243	lea	16(%eax), %eax
244	test	%edx, %edx
245	jnz	L(exit)
246
247	pcmpeqb	(%eax), %xmm2
248	pmovmskb %xmm2, %edx
249	lea	16(%eax), %eax
250	test	%edx, %edx
251	jnz	L(exit)
252
253	pcmpeqb	(%eax), %xmm3
254	pmovmskb %xmm3, %edx
255	lea	16(%eax), %eax
256	test	%edx, %edx
257	jnz	L(exit)
258
259#ifdef USE_AS_STRNLEN
260	sub	$64, %edi
261	jbe	L(len_less64)
262#endif
263
	/* Round 4: chunks 12..15.  If the last probe finds nothing, xmm3 is
	   left all-zero for the code that follows.  */
264	pcmpeqb	(%eax), %xmm0
265	pmovmskb %xmm0, %edx
266	lea	16(%eax), %eax
267	test	%edx, %edx
268	jnz	L(exit)
269
270	pcmpeqb	(%eax), %xmm1
271	pmovmskb %xmm1, %edx
272	lea	16(%eax), %eax
273	test	%edx, %edx
274	jnz	L(exit)
275
276	pcmpeqb	(%eax), %xmm2
277	pmovmskb %xmm2, %edx
278	lea	16(%eax), %eax
279	test	%edx, %edx
280	jnz	L(exit)
281
282	pcmpeqb	(%eax), %xmm3
283	pmovmskb %xmm3, %edx
284	lea	16(%eax), %eax
285	test	%edx, %edx
286	jnz	L(exit)
287
/* Main loop: scan 64 bytes per iteration from a 64-byte-aligned address.
   On entry eax = next unscanned 16-byte-aligned address, ecx = s + 16,
   and xmm3 is all-zero (its last probe above found no NUL).  */
288#ifdef USE_AS_STRNLEN
	/* Aligning %eax down to 64 below moves the scan back by (eax & 63)
	   bytes; give those bytes back to the budget first.  */
289	mov	%eax, %edx
290	and	$63, %edx
291	add	%edx, %edi
292#endif
293
	/* Round %eax down to a 64-byte boundary; bytes already scanned may be
	   scanned once more.  */
294	and	$-0x40, %eax
295
296	.p2align 4
297L(aligned_64_loop):
298#ifdef USE_AS_STRNLEN
	/* Reserve 64 bytes of budget for this iteration; if the budget does
	   not exceed 64, finish in L(len_less64).  */
299	sub	$64, %edi
300	jbe	L(len_less64)
301#endif
	/* Load four aligned 16-byte chunks.  xmm1 and xmm6 are only used as
	   sources below, so chunks 1 and 3 stay intact for the post-loop
	   search; xmm0 and xmm2 are overwritten by the minimum.  */
302	movaps	(%eax), %xmm0
303	movaps	16(%eax), %xmm1
304	movaps	32(%eax), %xmm2
305	movaps	48(%eax), %xmm6
	/* xmm2 = byte-wise unsigned minimum of all four chunks; it contains a
	   zero byte exactly when one of the 64 bytes is zero.  */
306	pminub	%xmm1, %xmm0
307	pminub	%xmm6, %xmm2
308	pminub	%xmm0, %xmm2
	/* Compare the minimum against zero (xmm3); the destination is xmm2,
	   so xmm3 stays zero across iterations.  */
309	pcmpeqb	%xmm3, %xmm2
310	pmovmskb %xmm2, %edx
311	lea	64(%eax), %eax
312	test	%edx, %edx
313	jz	L(aligned_64_loop)
314
	/* A NUL is somewhere in the 64 bytes at -64(%eax).  Find the chunk.
	   L(exit) computes eax - ecx, so %ecx is biased here to make that
	   difference equal the chunk's offset from s: +48 for chunk 0, then
	   16 less for each later chunk.  */
315	pcmpeqb	-64(%eax), %xmm3
316	pmovmskb %xmm3, %edx
317	lea	48(%ecx), %ecx
318	test	%edx, %edx
319	jnz	L(exit)
320
	/* Chunk 1, still held in xmm1.  xmm3 is zero again because the
	   previous compare matched nothing.  */
321	pcmpeqb	%xmm1, %xmm3
322	pmovmskb %xmm3, %edx
323	lea	-16(%ecx), %ecx
324	test	%edx, %edx
325	jnz	L(exit)
326
	/* Chunk 2, re-read from memory because xmm2 was overwritten.  */
327	pcmpeqb	-32(%eax), %xmm3
328	pmovmskb %xmm3, %edx
329	lea	-16(%ecx), %ecx
330	test	%edx, %edx
331	jnz	L(exit)
332
	/* Chunk 3, still held in xmm6.  No test is needed: the NUL must be
	   here, so control falls through into L(exit).  */
333	pcmpeqb	%xmm6, %xmm3
334	pmovmskb %xmm3, %edx
335	lea	-16(%ecx), %ecx
/* Common exit: turn (chunk position, byte mask) into the final length.
   In:  edx = pmovmskb mask of the chunk containing the NUL (bit i set when
              byte i of the chunk is zero; at least one of bits 0..15 set),
        eax, ecx arranged by the caller so that eax - ecx is the chunk's
              offset from the start of the string.
   Out: eax = chunk offset + index of the lowest set mask bit.
   The mask is decoded one nibble at a time: dl holds bits 0..7 and dh
   bits 8..15.  %cl / %ch are used as scratch only after %ecx has been
   consumed by the subtraction.  */
336L(exit):
337	sub	%ecx, %eax
	/* No bit set in the low byte: the NUL is at index 8..15.  */
338	test	%dl, %dl
339	jz	L(exit_high)
340
	/* Low nibble empty: the NUL is at index 4..7.  */
341	mov	%dl, %cl
342	and	$15, %cl
343	jz	L(exit_8)
344	test	$0x01, %dl
345	jnz	L(exit_tail0)
346	test	$0x02, %dl
347	jnz	L(exit_tail1)
348	test	$0x04, %dl
349	jnz	L(exit_tail2)
	/* Bits 0..2 clear but the nibble is non-zero, so bit 3 is set.  */
350	add	$3, %eax
351	RETURN
352
353	.p2align 4
354L(exit_8):
355	test	$0x10, %dl
356	jnz	L(exit_tail4)
357	test	$0x20, %dl
358	jnz	L(exit_tail5)
359	test	$0x40, %dl
360	jnz	L(exit_tail6)
	/* dl is non-zero with bits 0..6 clear, so bit 7 is set.  */
361	add	$7, %eax
362	RETURN
363
364	.p2align 4
365L(exit_high):
	/* Same decoding on the high byte of the mask, for indexes 8..15.  */
366	mov	%dh, %ch
367	and	$15, %ch
368	jz	L(exit_high_8)
369	test	$0x01, %dh
370	jnz	L(exit_tail8)
371	test	$0x02, %dh
372	jnz	L(exit_tail9)
373	test	$0x04, %dh
374	jnz	L(exit_tail10)
375	add	$11, %eax
376	RETURN
377
378	.p2align 4
379L(exit_high_8):
380	test	$0x10, %dh
381	jnz	L(exit_tail12)
382	test	$0x20, %dh
383	jnz	L(exit_tail13)
384	test	$0x40, %dh
385	jnz	L(exit_tail14)
	/* Index 15: add it and fall through into the shared return.  */
386	add	$15, %eax
	/* Return %eax unchanged.  */
387L(exit_tail0):
388	RETURN
389
/* strnlen only: the USE_AS_STRNLEN region opened here runs down to the
   #endif after L(len_less16_prolog).  */
390#ifdef USE_AS_STRNLEN
391
/* L(len_less64): bounded tail, reached whenever "sub $64, %edi" left the
   budget at or below zero, i.e. at most 64 bytes may still be examined.
   In:  eax = address of the next 16-byte chunk, ecx = s + 16,
        edi = remaining budget minus 64.
   Scans up to four chunks.  A NUL goes to L(strnlen_exit), which checks it
   against the budget; running out of budget returns the limit.  */
392	.p2align 4
393L(len_less64):
	/* xmm0 may hold string data when arriving from the 64-byte loop, so
	   it is cleared again.  Undo the caller's subtraction: edi = number
	   of bytes still allowed, counted from (%eax).  */
394	pxor	%xmm0, %xmm0
395	add	$64, %edi
396
397	pcmpeqb	(%eax), %xmm0
398	pmovmskb %xmm0, %edx
399	pxor	%xmm1, %xmm1
400	lea	16(%eax), %eax
401	test	%edx, %edx
402	jnz	L(strnlen_exit)
403
	/* Chunk had no NUL: charge 16 bytes; stop if nothing is left.  */
404	sub	$16, %edi
405	jbe	L(return_start_len)
406
407	pcmpeqb	(%eax), %xmm1
408	pmovmskb %xmm1, %edx
409	lea	16(%eax), %eax
410	test	%edx, %edx
411	jnz	L(strnlen_exit)
412
413	sub	$16, %edi
414	jbe	L(return_start_len)
415
	/* xmm0 / xmm1 are zero again after a probe that matched nothing.  */
416	pcmpeqb	(%eax), %xmm0
417	pmovmskb %xmm0, %edx
418	lea	16(%eax), %eax
419	test	%edx, %edx
420	jnz	L(strnlen_exit)
421
422	sub	$16, %edi
423	jbe	L(return_start_len)
424
425	pcmpeqb	(%eax), %xmm1
426	pmovmskb %xmm1, %edx
427	lea	16(%eax), %eax
428	test	%edx, %edx
429	jnz	L(strnlen_exit)
430
	/* All four chunks (the whole remaining budget) are NUL-free: the
	   result is the limit itself.
	   NOTE(review): under USE_AS_STRLCAT the local L(return_start_len)
	   definition further down is compiled out, so the jump target must
	   come from the including file -- confirm.  */
431#ifndef USE_AS_STRLCAT
432	movl	LEN(%esp), %eax
433	RETURN
434#else
435	jmp	L(return_start_len)
436#endif
437
438	.p2align 4
/* L(strnlen_exit): like L(exit), but the NUL position is checked against
   the remaining budget before it is accepted.
   In:  edx = pmovmskb mask of the chunk (lowest set bit = NUL index k),
        eax = 16 bytes past the chunk, ecx = s + 16,
        edi = bytes still allowed, counted from the start of the chunk.
   For index k >= 1 each path does "sub $(k + 1), %edi" and, on borrow
   (fewer than k + 1 bytes allowed), returns the limit through
   L(return_start_len); otherwise it returns chunk offset + k.
   No budget check is made for index 0.  */
439L(strnlen_exit):
	/* eax = offset of the chunk from the start of the string.  */
440	sub	%ecx, %eax
441
	/* Decode the mask one nibble at a time; dl = bits 0..7, dh = 8..15.  */
442	test	%dl, %dl
443	jz	L(strnlen_exit_high)
444	mov	%dl, %cl
445	and	$15, %cl
446	jz	L(strnlen_exit_8)
447	test	$0x01, %dl
448	jnz	L(exit_tail0)
449	test	$0x02, %dl
450	jnz	L(strnlen_exit_tail1)
451	test	$0x04, %dl
452	jnz	L(strnlen_exit_tail2)
	/* Index 3.  */
453	sub	$4, %edi
454	jb	L(return_start_len)
455	lea	3(%eax), %eax
456	RETURN
457
458	.p2align 4
459L(strnlen_exit_8):
460	test	$0x10, %dl
461	jnz	L(strnlen_exit_tail4)
462	test	$0x20, %dl
463	jnz	L(strnlen_exit_tail5)
464	test	$0x40, %dl
465	jnz	L(strnlen_exit_tail6)
	/* Index 7.  */
466	sub	$8, %edi
467	jb	L(return_start_len)
468	lea	7(%eax), %eax
469	RETURN
470
471	.p2align 4
472L(strnlen_exit_high):
473	mov	%dh, %ch
474	and	$15, %ch
475	jz	L(strnlen_exit_high_8)
476	test	$0x01, %dh
477	jnz	L(strnlen_exit_tail8)
478	test	$0x02, %dh
479	jnz	L(strnlen_exit_tail9)
480	test	$0x04, %dh
481	jnz	L(strnlen_exit_tail10)
	/* Index 11.  */
482	sub	$12, %edi
483	jb	L(return_start_len)
484	lea	11(%eax), %eax
485	RETURN
486
487	.p2align 4
488L(strnlen_exit_high_8):
489	test	$0x10, %dh
490	jnz	L(strnlen_exit_tail12)
491	test	$0x20, %dh
492	jnz	L(strnlen_exit_tail13)
493	test	$0x40, %dh
494	jnz	L(strnlen_exit_tail14)
	/* Index 15.  */
495	sub	$16, %edi
496	jb	L(return_start_len)
497	lea	15(%eax), %eax
498	RETURN
499
	/* Budget-checked tails for the remaining indexes; each follows the
	   "sub $(k + 1) / jb / lea k(%eax)" pattern described above.  */
500	.p2align 4
501L(strnlen_exit_tail1):
502	sub	$2, %edi
503	jb	L(return_start_len)
504	lea	1(%eax), %eax
505	RETURN
506
507	.p2align 4
508L(strnlen_exit_tail2):
509	sub	$3, %edi
510	jb	L(return_start_len)
511	lea	2(%eax), %eax
512	RETURN
513
514	.p2align 4
515L(strnlen_exit_tail4):
516	sub	$5, %edi
517	jb	L(return_start_len)
518	lea	4(%eax), %eax
519	RETURN
520
521	.p2align 4
522L(strnlen_exit_tail5):
523	sub	$6, %edi
524	jb	L(return_start_len)
525	lea	5(%eax), %eax
526	RETURN
527
528	.p2align 4
529L(strnlen_exit_tail6):
530	sub	$7, %edi
531	jb	L(return_start_len)
532	lea	6(%eax), %eax
533	RETURN
534
535	.p2align 4
536L(strnlen_exit_tail8):
537	sub	$9, %edi
538	jb	L(return_start_len)
539	lea	8(%eax), %eax
540	RETURN
541
542	.p2align 4
543L(strnlen_exit_tail9):
544	sub	$10, %edi
545	jb	L(return_start_len)
546	lea	9(%eax), %eax
547	RETURN
548
549	.p2align 4
550L(strnlen_exit_tail10):
551	sub	$11, %edi
552	jb	L(return_start_len)
553	lea	10(%eax), %eax
554	RETURN
555
556	.p2align 4
557L(strnlen_exit_tail12):
558	sub	$13, %edi
559	jb	L(return_start_len)
560	lea	12(%eax), %eax
561	RETURN
562
563	.p2align 4
564L(strnlen_exit_tail13):
565	sub	$14, %edi
566	jb	L(return_start_len)
567	lea	13(%eax), %eax
568	RETURN
569
570	.p2align 4
571L(strnlen_exit_tail14):
572	sub	$15, %edi
573	jb	L(return_start_len)
574	lea	14(%eax), %eax
575	RETURN
576
/* L(return_start_len): no NUL within the limit, so return the limit
   itself, re-read from the stack argument.  Not defined here when
   USE_AS_STRLCAT is set.  */
577#ifndef USE_AS_STRLCAT
578	.p2align 4
579L(return_start_len):
580	movl	LEN(%esp), %eax
581	RETURN
582#endif
583
584/* Bounded byte-by-byte paths used only by the scalar prolog at function
   entry (strnlen with a limit of 16 or less).  Each is entered right after
   "sub $4, %edi" hit zero or borrowed, with edx = s.  Byte tests are
   interleaved with limit tests so that no byte at or beyond the limit is
   read.  */
585
586	.p2align 4
587L(len_less4_prolog):
	/* Limit <= 4.  This label is reached before the main path clears
	   %eax, so clear it here.  */
588	xor	%eax, %eax
589
	/* edi = limit again; a limit of 0 returns 0 without reading s.  */
590	add	$4, %edi
591	jz	L(exit_tail0)
592
593	cmpb	$0, (%edx)
594	jz	L(exit_tail0)
	/* Byte 0 is non-zero: with a limit of 1 the answer is 1.  The same
	   pattern repeats for limits 2 and 3.  */
595	cmp	$1, %edi
596	je	L(exit_tail1)
597
598	cmpb	$0, 1(%edx)
599	jz	L(exit_tail1)
600	cmp	$2, %edi
601	je	L(exit_tail2)
602
603	cmpb	$0, 2(%edx)
604	jz	L(exit_tail2)
605	cmp	$3, %edi
606	je	L(exit_tail3)
607
608	cmpb	$0, 3(%edx)
609	jz	L(exit_tail3)
	/* No NUL within the limit: return the limit held in %edi.  */
610	mov	%edi, %eax
611	RETURN
612
613	.p2align 4
614L(len_less8_prolog):
	/* Bytes 0..3 were non-zero and %eax is still 0 from the main path.
	   edi = number of bytes still allowed starting at byte 4.  */
615	add	$4, %edi
616
617	cmpb	$0, 4(%edx)
618	jz	L(exit_tail4)
619	cmp	$1, %edi
620	je	L(exit_tail5)
621
622	cmpb	$0, 5(%edx)
623	jz	L(exit_tail5)
624	cmp	$2, %edi
625	je	L(exit_tail6)
626
627	cmpb	$0, 6(%edx)
628	jz	L(exit_tail6)
629	cmp	$3, %edi
630	je	L(exit_tail7)
631
632	cmpb	$0, 7(%edx)
633	jz	L(exit_tail7)
	/* Bytes 4..7 were all allowed and non-zero: the result is 8.  */
634	mov	$8, %eax
635	RETURN
636
637
638	.p2align 4
639L(len_less12_prolog):
	/* Same scheme for bytes 8..11; edi = bytes allowed from byte 8.  */
640	add	$4, %edi
641
642	cmpb	$0, 8(%edx)
643	jz	L(exit_tail8)
644	cmp	$1, %edi
645	je	L(exit_tail9)
646
647	cmpb	$0, 9(%edx)
648	jz	L(exit_tail9)
649	cmp	$2, %edi
650	je	L(exit_tail10)
651
652	cmpb	$0, 10(%edx)
653	jz	L(exit_tail10)
654	cmp	$3, %edi
655	je	L(exit_tail11)
656
657	cmpb	$0, 11(%edx)
658	jz	L(exit_tail11)
659	mov	$12, %eax
660	RETURN
661
662	.p2align 4
663L(len_less16_prolog):
	/* Same scheme for bytes 12..15; edi = bytes allowed from byte 12.  */
664	add	$4, %edi
665
666	cmpb	$0, 12(%edx)
667	jz	L(exit_tail12)
668	cmp	$1, %edi
669	je	L(exit_tail13)
670
671	cmpb	$0, 13(%edx)
672	jz	L(exit_tail13)
673	cmp	$2, %edi
674	je	L(exit_tail14)
675
676	cmpb	$0, 14(%edx)
677	jz	L(exit_tail14)
678	cmp	$3, %edi
679	je	L(exit_tail15)
680
681	cmpb	$0, 15(%edx)
682	jz	L(exit_tail15)
683	mov	$16, %eax
684	RETURN
685#endif
686
687	.p2align 4
/* Shared return stubs: L(exit_tailN) adds N to %eax and returns.
   %eax is 0 when reached from the scalar prolog, and the chunk offset
   when reached from L(exit), so the result is the index of the NUL from
   the start of the string.  */
688L(exit_tail1):
689	add	$1, %eax
690	RETURN
691
692L(exit_tail2):
693	add	$2, %eax
694	RETURN
695
696L(exit_tail3):
697	add	$3, %eax
698	RETURN
699
700L(exit_tail4):
701	add	$4, %eax
702	RETURN
703
704L(exit_tail5):
705	add	$5, %eax
706	RETURN
707
708L(exit_tail6):
709	add	$6, %eax
710	RETURN
711
712L(exit_tail7):
713	add	$7, %eax
714	RETURN
715
716L(exit_tail8):
717	add	$8, %eax
718	RETURN
719
720L(exit_tail9):
721	add	$9, %eax
722	RETURN
723
724L(exit_tail10):
725	add	$10, %eax
726	RETURN
727
728L(exit_tail11):
729	add	$11, %eax
730	RETURN
731
732L(exit_tail12):
733	add	$12, %eax
734	RETURN
735
736L(exit_tail13):
737	add	$13, %eax
738	RETURN
739
740L(exit_tail14):
741	add	$14, %eax
742	RETURN
743
744L(exit_tail15):
745	add	$15, %eax
	/* With USE_AS_STRCAT the final RETURN and END are omitted, so after
	   the add execution continues into whatever follows this text.
	   NOTE(review): presumably the including file places its own code
	   there -- confirm.  */
746#ifndef USE_AS_STRCAT
747	RETURN
748END (STRLEN)
749#endif
750