/*
Copyright (c) 2011, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Optimized strlcat with SSSE3 */

/*
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 * Target: 32-bit x86 (arguments are read from the stack via %esp).
 *
 * Fallback definitions for the CFI and function-framing helper macros.
 * Each one is defined only when the build has not already supplied it,
 * so a project-wide header may override any of them.
 */

/* Open the call-frame-information region of a function.  */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

/* Close the call-frame-information region of a function.  */
#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

/* Record that REG is saved at OFF bytes from the current stack pointer.  */
#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

/* Record that REG holds its caller's value again.  */
#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

/* Record that the CFA offset changed by OFF bytes.  */
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/*
 * ENTRY (name): declare NAME as a global function symbol, align it to
 * 16 bytes, emit its label and start its CFI region.
 */
#ifndef ENTRY
# define ENTRY(name)	\
	.type name,  @function;	\
	.globl name;	\
	.p2align 4;	\
name:	\
	cfi_startproc
#endif

/*
 * END (name): end the CFI region and set the ELF size of NAME.
 * NAME must be the symbol that the matching ENTRY defined, because the
 * size expression is computed as (here - name).
 */
#ifndef END
# define END(name)	\
	cfi_endproc;	\
	.size name, .-name
#endif

/* CFI bookkeeping for a 4-byte push of REG: the CFA grows by 4 and REG
   is saved at offset 0 from the new stack pointer.  */
#define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* CFI bookkeeping for a 4-byte pop of REG: the CFA shrinks by 4 and REG
   holds its caller's value again.  */
#define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push / pop a 32-bit register and keep the unwind information in sync.  */
#define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
#define POP(REG)	popl	REG;	CFI_POP (REG)

/* Local-label generator.  While the first include is being expanded every
   label gets a "Prolog_" infix; the macro is redefined without the infix
   further down in this file.  */
#define L(label)	.L##Prolog_##label

/*
 * Stack offsets of the arguments, relative to %esp.
 *
 * DST is the offset at function entry, before anything has been pushed
 * (the entry code reads it before PUSH (%ebx); later code, with %ebx
 * pushed, reads the same argument as DST + 4).
 * SRC and LEN already include the 4 bytes of the pushed %ebx, which is
 * why SRC is DST+8 rather than DST+4.
 *
 * The definitions are deliberately left unparenthesised; they are only
 * used directly as a displacement, e.g. SRC(%esp).
 */
#define DST	4
#define SRC	DST+8
#define LEN	SRC+4

/*-----------------------------------------------------------------------
 * strlcat_ssse3
 *
 * Arguments are taken from the stack: DST, SRC and LEN (see the offset
 * macros).  Per the file comment this is an strlcat implementation, so
 * the C prototype is presumably
 *     size_t strlcat (char *dst, const char *src, size_t size);
 * -- confirm against the C declaration.
 *
 * Out:    %eax = return value.
 * Saved:  %ebx is pushed here and popped on every return path; %edi is
 *         pushed later, only on the copy path.
 *
 * This first part loads dst into %edx and size into %ebx and then
 * expands "sse2-strlen-atom.S" as a length prologue:
 *   - L() still carries the "Prolog_" infix, so the labels of the
 *     include cannot clash with the rest of this file;
 *   - RETURN inside the include jumps to L(StrcpyStep) instead of
 *     returning to the caller;
 *   - the token "edi" is rewritten to "ebx", so the include works on the
 *     register that has been saved here.
 * The include is not visible from this file.  NOTE(review): it is assumed
 * to leave the length of dst, bounded by size, in %eax -- confirm there.
 *-----------------------------------------------------------------------*/
	.text
ENTRY (strlcat_ssse3)
	mov	DST(%esp), %edx		/* dst; read before the push below */
	PUSH	(%ebx)
	mov	LEN(%esp), %ebx		/* size */
	sub	$4, %ebx		/* %ebx = size - 4 */
	jbe	L(len_less4_prolog)	/* size <= 4: label lives in the include */

#define RETURN	jmp	L(StrcpyStep)
#define edi	ebx

#define USE_AS_STRNLEN
#define USE_AS_STRCAT
#define USE_AS_STRLCAT

#include "sse2-strlen-atom.S"

/*
 * L(StrcpyStep): continuation point of the length prologue (its RETURN
 * macro jumps here).  The label itself is still emitted with the
 * "Prolog_" infix because L() is redefined only after it.
 *
 * In:   %eax = value produced by the prologue (assumed: length of dst,
 *              bounded by size -- see the include).
 *       %ebx is on the stack, so SRC/LEN are valid displacements.
 *
 * Register roles set up here for the copy code:
 *       %ecx = src
 *       %ebx = size - %eax          (room left in dst, terminator included)
 *       %edx = dst + %eax           (where the copy starts)
 *       %edi = src - %eax           (bias: position_in_src - %edi is
 *                                    %eax + bytes of src consumed)
 *
 * RETURN  : pop %ebx and return (paths on which %edi was not pushed).
 * RETURN1 : pop %edi, pop %ebx and return (copy paths).
 * The CFI_PUSH directives after each "ret" re-establish the unwind state
 * for the code that textually follows the macro.
 */
	.p2align 4
L(StrcpyStep):

#undef edi
#undef L
#define L(label) .L##label
#undef RETURN
#define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx);
#define RETURN1	POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)

	movl	SRC(%esp), %ecx
	movl	LEN(%esp), %ebx

	/* No room at all: only the length of src is still needed.  %ebx
	   must still hold the full size on this path, because the target
	   uses it as the bias.  */
	cmp	%eax, %ebx
	je	L(CalculateLengthOfSrcProlog)
	sub	%eax, %ebx
	/* %ebx != %eax was established above, so the difference cannot be
	   zero here; no second zero test is needed.  */

	mov	DST + 4(%esp), %edx	/* dst; +4 for the pushed %ebx */

	PUSH	(%edi)
	add	%eax, %edx
	mov	%ecx, %edi
	sub	%eax, %edi

	cmp	$8, %ebx
	jbe	L(StrncpyExit8Bytes)

	/* More than 8 bytes of room: the first 8 source bytes can be
	   tested for NUL without consulting the limit.  */
	cmpb	$0, (%ecx)
	jz	L(Exit1)
	cmpb	$0, 1(%ecx)
	jz	L(Exit2)
	cmpb	$0, 2(%ecx)
	jz	L(Exit3)
	cmpb	$0, 3(%ecx)
	jz	L(Exit4)
	cmpb	$0, 4(%ecx)
	jz	L(Exit5)
	cmpb	$0, 5(%ecx)
	jz	L(Exit6)
	cmpb	$0, 6(%ecx)
	jz	L(Exit7)
	cmpb	$0, 7(%ecx)
	jz	L(Exit8)
	cmp	$16, %ebx
	jb	L(StrncpyExit15Bytes)
	/* At least 16 bytes of room: same for source bytes 8..15.  */
	cmpb	$0, 8(%ecx)
	jz	L(Exit9)
	cmpb	$0, 9(%ecx)
	jz	L(Exit10)
	cmpb	$0, 10(%ecx)
	jz	L(Exit11)
	cmpb	$0, 11(%ecx)
	jz	L(Exit12)
	cmpb	$0, 12(%ecx)
	jz	L(Exit13)
	cmpb	$0, 13(%ecx)
	jz	L(Exit14)
	cmpb	$0, 14(%ecx)
	jz	L(Exit15)
	cmpb	$0, 15(%ecx)
	jz	L(Exit16)
	cmp	$16, %ebx
	je	L(StrlcpyExit16)	/* exactly 16 bytes of room, no NUL seen */

	/* Longer copies are handled by the shared copy loop, which comes
	   back into this file through the L(CopyFrom1To16Bytes*) labels.  */
#define USE_AS_STRNCPY
#include "ssse3-strcpy-atom.S"

/*
 * L(CopyFrom1To16Bytes): entered from the included copy code (not
 * visible here) with %esi pushed.
 *
 * In:   %esi = offset that is added to both %ecx (source) and %edx
 *              (destination) before %esi is restored.
 *       %ax  = 16-bit mask, tested bit by bit below.  NOTE(review): it
 *              looks like a pmovmskb mask of NUL positions in a 16-byte
 *              block, produced by the include -- confirm there.
 *
 * The lowest set bit k of the mask (0-based) selects L(Exit<k+1>), which
 * copies k+1 bytes and returns.  This entry is used when the remaining
 * room does not have to be consulted.
 *
 * L(CopyFrom1To16BytesLess8) is a secondary entry that requires %al != 0.
 */
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)
	test	%al, %al
	jz	L(ExitHigh8)		/* no bit set in mask bits 0..7 */

L(CopyFrom1To16BytesLess8):
	mov	%al, %ah		/* %ah is scratch here */
	and	$15, %ah
	jz	L(ExitHigh4)		/* no bit set in mask bits 0..3 */

	test	$0x01, %al
	jnz	L(Exit1)
	test	$0x02, %al
	jnz	L(Exit2)
	test	$0x04, %al
	jnz	L(Exit3)
	/* Bit 3 must be the one that is set: copy 4 bytes.  */
L(Exit4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	lea	3(%ecx), %eax		/* address of the last byte copied */
	sub	%edi, %eax		/* minus bias: value returned in %eax */
	RETURN1

	.p2align 4
L(ExitHigh4):
	test	$0x10, %al
	jnz	L(Exit5)
	test	$0x20, %al
	jnz	L(Exit6)
	test	$0x40, %al
	jnz	L(Exit7)
	/* Bit 7: copy 8 bytes.  */
L(Exit8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	lea	7(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(ExitHigh8):
	mov	%ah, %al		/* %al is scratch here (it was zero) */
	and	$15, %al
	jz	L(ExitHigh12)		/* no bit set in mask bits 8..11 */

	test	$0x01, %ah
	jnz	L(Exit9)
	test	$0x02, %ah
	jnz	L(Exit10)
	test	$0x04, %ah
	jnz	L(Exit11)
	/* Bit 11: copy 12 bytes.  */
L(Exit12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	lea	11(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(ExitHigh12):
	test	$0x10, %ah
	jnz	L(Exit13)
	test	$0x20, %ah
	jnz	L(Exit14)
	test	$0x40, %ah
	jnz	L(Exit15)
	/* Bit 15: copy 16 bytes.  */
L(Exit16):
	movlpd	(%ecx), %xmm0
	movlpd	8(%ecx), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)

	lea	15(%ecx), %eax
	sub	%edi, %eax
	RETURN1

/*
 * L(CopyFrom1To16BytesCase2): entered from the included copy code with
 * %esi pushed (hence the CFI_PUSH in front of the label) and a non-zero
 * mask in %eax.
 *
 * In:   %esi = offset added to %ecx (source) and %edx (destination).
 *       %ax  = 16-bit mask, same meaning as for L(CopyFrom1To16Bytes).
 *       %ebx = remaining room minus 16; the add below undoes that
 *              (presumably the include subtracted 16 -- confirm there).
 *
 * Both a mask bit and the room limit may fall inside this 16-byte block,
 * so the two are tested alternately, in byte order.  For k = 1..16:
 *       mask bit k-1 set  -> L(Exit<k>)         whole rest of src fits
 *       room == k         -> L(StrlcpyExit<k>)  truncate after k-1 bytes
 */
	CFI_PUSH(%esi)

	.p2align 4
L(CopyFrom1To16BytesCase2):
	add	$16, %ebx
	add	%esi, %ecx
	add	%esi, %edx

	POP	(%esi)

	test	%al, %al
	jz	L(ExitHighCase2)	/* no bit set in mask bits 0..7 */

	/* A bit is set in the low mask byte.  With more than 8 bytes of
	   room the limit cannot come first.  */
	cmp	$8, %ebx
	ja	L(CopyFrom1To16BytesLess8)

	test	$0x01, %al
	jnz	L(Exit1)
	cmp	$1, %ebx
	je	L(StrlcpyExit1)
	test	$0x02, %al
	jnz	L(Exit2)
	cmp	$2, %ebx
	je	L(StrlcpyExit2)
	test	$0x04, %al
	jnz	L(Exit3)
	cmp	$3, %ebx
	je	L(StrlcpyExit3)
	test	$0x08, %al
	jnz	L(Exit4)
	cmp	$4, %ebx
	je	L(StrlcpyExit4)
	test	$0x10, %al
	jnz	L(Exit5)
	cmp	$5, %ebx
	je	L(StrlcpyExit5)
	test	$0x20, %al
	jnz	L(Exit6)
	cmp	$6, %ebx
	je	L(StrlcpyExit6)
	test	$0x40, %al
	jnz	L(Exit7)
	cmp	$7, %ebx
	je	L(StrlcpyExit7)
	test	$0x80, %al
	jnz	L(Exit8)
	jmp	L(StrlcpyExit8)

	.p2align 4
L(ExitHighCase2):
	/* No bit in mask bits 0..7.  With at most 8 bytes of room the limit
	   is reached first, so dispatch on the room alone.  */
	cmp	$8, %ebx
	jbe	L(CopyFrom1To16BytesLess8Case3)

	test	$0x01, %ah
	jnz	L(Exit9)
	cmp	$9, %ebx
	je	L(StrlcpyExit9)
	test	$0x02, %ah
	jnz	L(Exit10)
	cmp	$10, %ebx
	je	L(StrlcpyExit10)
	test	$0x04, %ah
	jnz	L(Exit11)
	cmp	$11, %ebx
	je	L(StrlcpyExit11)
	test	$0x8, %ah
	jnz	L(Exit12)
	cmp	$12, %ebx
	je	L(StrlcpyExit12)
	test	$0x10, %ah
	jnz	L(Exit13)
	cmp	$13, %ebx
	je	L(StrlcpyExit13)
	test	$0x20, %ah
	jnz	L(Exit14)
	cmp	$14, %ebx
	je	L(StrlcpyExit14)
	test	$0x40, %ah
	jnz	L(Exit15)
	cmp	$15, %ebx
	je	L(StrlcpyExit15)
	test	$0x80, %ah
	jnz	L(Exit16)
	jmp	L(StrlcpyExit16)

/*
 * L(CopyFrom1To16BytesCase2OrCase3) / L(CopyFrom1To16BytesCase3):
 * entered from the included copy code with %esi pushed.
 *
 * Case2OrCase3 goes to Case2 when the mask in %eax is non-zero and
 * falls into Case3 otherwise.  In Case3 no mask bit is set, so only the
 * remaining room (%ebx, after undoing a bias of 16) decides how much is
 * copied: room == k selects L(StrlcpyExit<k>).
 *
 * L(StrlcpyExit<k>) -- truncating exits, all of the same shape:
 *   - store a zero byte at offset k-1 of the destination.  The zero is
 *     taken from %bh, which relies on %ebx < 256 at this point (the
 *     visible callers arrive with %ebx == k or with a small remainder);
 *   - copy the first k-1 source bytes, using overlapping loads/stores
 *     for sizes that are not a power of two;
 *   - set %edx = source + k (scan position) and %ecx = bias from %edi,
 *     restore %edi and continue in L(CalculateLengthOfSrc) to measure
 *     the rest of src.
 * The CFI_PUSH (%edi) after each jmp restores the unwind state for the
 * code that textually follows.
 */
	CFI_PUSH(%esi)

	.p2align 4
L(CopyFrom1To16BytesCase2OrCase3):
	test	%eax, %eax
	jnz	L(CopyFrom1To16BytesCase2)

	.p2align 4
L(CopyFrom1To16BytesCase3):
	add	$16, %ebx
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)

	cmp	$8, %ebx
	ja	L(ExitHigh8Case3)

L(CopyFrom1To16BytesLess8Case3):
	cmp	$4, %ebx
	ja	L(ExitHigh4Case3)

	cmp	$1, %ebx
	je	L(StrlcpyExit1)
	cmp	$2, %ebx
	je	L(StrlcpyExit2)
	cmp	$3, %ebx
	je	L(StrlcpyExit3)
L(StrlcpyExit4):
	movb	%bh, 3(%edx)		/* terminator */
	movw	(%ecx), %ax		/* bytes 0..1 */
	movw	%ax, (%edx)
	movb	2(%ecx), %al		/* byte 2 */
	movb	%al, 2(%edx)

	lea	4(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(ExitHigh4Case3):
	cmp	$5, %ebx
	je	L(StrlcpyExit5)
	cmp	$6, %ebx
	je	L(StrlcpyExit6)
	cmp	$7, %ebx
	je	L(StrlcpyExit7)
L(StrlcpyExit8):
	movb	%bh, 7(%edx)		/* terminator */
	movl	(%ecx), %eax		/* bytes 0..3 */
	movl	%eax, (%edx)
	movl	3(%ecx), %eax		/* bytes 3..6 (overlaps byte 3) */
	movl	%eax, 3(%edx)

	lea	8(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(ExitHigh8Case3):
	cmp	$12, %ebx
	ja	L(ExitHigh12Case3)

	cmp	$9, %ebx
	je	L(StrlcpyExit9)
	cmp	$10, %ebx
	je	L(StrlcpyExit10)
	cmp	$11, %ebx
	je	L(StrlcpyExit11)
L(StrlcpyExit12):
	movb	%bh, 11(%edx)		/* terminator */
	movlpd	(%ecx), %xmm0		/* bytes 0..7 */
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax		/* bytes 7..10 (overlaps byte 7) */
	movl	%eax, 7(%edx)

	lea	12(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(ExitHigh12Case3):
	cmp	$13, %ebx
	je	L(StrlcpyExit13)
	cmp	$14, %ebx
	je	L(StrlcpyExit14)
	cmp	$15, %ebx
	je	L(StrlcpyExit15)
L(StrlcpyExit16):
	movb	%bh, 15(%edx)		/* terminator */
	movlpd	(%ecx), %xmm0		/* bytes 0..7 */
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0		/* bytes 7..14 (overlaps byte 7) */
	movlpd	%xmm0, 7(%edx)

	lea	16(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

/*
 * Exit stubs for 1..7 bytes (4 is defined with the mask dispatch and the
 * truncating 4 with the Case3 dispatch).
 *
 * In (all stubs): %ecx = source position, %edx = destination position,
 *                 %edi = bias (see L(StrcpyStep)), %edi and %ebx pushed.
 *
 * L(Exit<k>):        copy k bytes from (%ecx) to (%edx); callers branch
 *                    here when source byte k-1 is the NUL, so it is the
 *                    last byte copied.  Returns
 *                    %eax = (%ecx + k - 1) - %edi  via RETURN1.
 * L(StrlcpyExit<k>): write a zero byte (from %bh, which relies on
 *                    %ebx < 256) at offset k-1, copy k-1 bytes, then
 *                    continue in L(CalculateLengthOfSrc) with
 *                    %edx = %ecx + k and %ecx = bias, %edi restored.
 */
	.p2align 4
L(StrlcpyExit1):
	movb	%bh, (%edx)		/* terminator only */

	lea	1(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(Exit1):
	movb	(%ecx), %al
	movb	%al, (%edx)

	mov	%ecx, %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit2):
	movb	%bh, 1(%edx)
	movb	(%ecx), %al
	movb	%al, (%edx)

	lea	2(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(Exit2):
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	lea	1(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit3):
	movb	%bh, 2(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	lea	3(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(Exit3):
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	lea	2(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit5):
	movb	%bh, 4(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	lea	5(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(Exit5):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	lea	4(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit6):
	movb	%bh, 5(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	lea	6(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(Exit6):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	lea	5(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit7):
	movb	%bh, 6(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	lea	7(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(Exit7):
	movl	(%ecx), %eax		/* bytes 0..3 */
	movl	%eax, (%edx)
	movl	3(%ecx), %eax		/* bytes 3..6 (overlaps byte 3) */
	movl	%eax, 3(%edx)

	lea	6(%ecx), %eax
	sub	%edi, %eax
	RETURN1

/*
 * Exit stubs for 9..15 bytes (12 and 16 are defined with the dispatch
 * code).  Same contract as the 1..7 byte stubs:
 *
 * L(Exit<k>):        copy k bytes (source byte k-1 is the NUL on every
 *                    visible path), return %eax = (%ecx + k - 1) - %edi
 *                    via RETURN1.
 * L(StrlcpyExit<k>): zero byte (from %bh, relies on %ebx < 256) at
 *                    offset k-1, copy k-1 bytes, then continue in
 *                    L(CalculateLengthOfSrc) with %edx = %ecx + k and
 *                    %ecx = bias, %edi restored.
 *
 * The first 8 bytes always go through %xmm0 (movlpd moves 8 bytes); the
 * tail is a narrower move or a second, overlapping 8-byte move.
 */
	.p2align 4
L(StrlcpyExit9):
	movb	%bh, 8(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	lea	9(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(Exit9):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	lea	8(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit10):
	movb	%bh, 9(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	lea	10(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(Exit10):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	lea	9(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit11):
	movb	%bh, 10(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	lea	11(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(Exit11):
	movlpd	(%ecx), %xmm0		/* bytes 0..7 */
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax		/* bytes 7..10 (overlaps byte 7) */
	movl	%eax, 7(%edx)

	lea	10(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit13):
	movb	%bh, 12(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	lea	13(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(Exit13):
	movlpd	(%ecx), %xmm0		/* bytes 0..7 */
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0		/* bytes 5..12 (overlaps 5..7) */
	movlpd	%xmm0, 5(%edx)

	lea	12(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit14):
	movb	%bh, 13(%edx)
	movlpd	(%ecx), %xmm0		/* bytes 0..7 */
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0		/* bytes 5..12 (overlaps 5..7) */
	movlpd	%xmm0, 5(%edx)

	lea	14(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(Exit14):
	movlpd	(%ecx), %xmm0		/* bytes 0..7 */
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0		/* bytes 6..13 (overlaps 6..7) */
	movlpd	%xmm0, 6(%edx)

	lea	13(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit15):
	movb	%bh, 14(%edx)
	movlpd	(%ecx), %xmm0		/* bytes 0..7 */
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0		/* bytes 6..13 (overlaps 6..7) */
	movlpd	%xmm0, 6(%edx)

	lea	15(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH     (%edi)

	.p2align 4
L(Exit15):
	movlpd	(%ecx), %xmm0		/* bytes 0..7 */
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0		/* bytes 7..14 (overlaps byte 7) */
	movlpd	%xmm0, 7(%edx)

	lea	14(%ecx), %eax
	sub	%edi, %eax
	RETURN1

/*
 * Short-room dispatch, reached from L(StrcpyStep).
 *
 * In:   %ecx = source, %edx = destination, %ebx = remaining room,
 *       %edi = bias; %edi and %ebx pushed.
 *
 * L(StrncpyExit15Bytes):  room below 16 (branch taken on "jb" after the
 *       room was found to be above 8); source bytes 0..7 are already
 *       known to be non-NUL.
 * L(StrncpyExit8Bytes):   room of at most 8; nothing has been tested.
 *
 * Each routine walks the source byte by byte.  A NUL at index k-1 goes
 * to L(Exit<k>); reaching the room limit first (room == k with bytes
 * 0..k-1 non-NUL) goes to L(StrlcpyExit<k>).  The "...Bytes1" variants
 * handle the upper half of the range, where the first four bytes of the
 * group cannot hit the limit and need no room comparison.
 */
	.p2align 4
L(StrncpyExit15Bytes):
	cmp	$12, %ebx
	ja	L(StrncpyExit15Bytes1)

	/* Room is 9..12.  */
	cmpb	$0, 8(%ecx)
	jz	L(Exit9)
	cmp	$9, %ebx
	je	L(StrlcpyExit9)

	cmpb	$0, 9(%ecx)
	jz	L(Exit10)
	cmp	$10, %ebx
	je	L(StrlcpyExit10)

	cmpb	$0, 10(%ecx)
	jz	L(Exit11)
	cmp	$11, %ebx
	je	L(StrlcpyExit11)

	cmpb	$0, 11(%ecx)
	jz	L(Exit12)
	jmp	L(StrlcpyExit12)

	.p2align 4
L(StrncpyExit15Bytes1):
	/* Room is 13..15: bytes 8..11 cannot hit the limit.  */
	cmpb	$0, 8(%ecx)
	jz	L(Exit9)
	cmpb	$0, 9(%ecx)
	jz	L(Exit10)
	cmpb	$0, 10(%ecx)
	jz	L(Exit11)
	cmpb	$0, 11(%ecx)
	jz	L(Exit12)

	cmpb	$0, 12(%ecx)
	jz	L(Exit13)
	cmp	$13, %ebx
	je	L(StrlcpyExit13)

	cmpb	$0, 13(%ecx)
	jz	L(Exit14)
	cmp	$14, %ebx
	je	L(StrlcpyExit14)

	cmpb	$0, 14(%ecx)
	jz	L(Exit15)
	jmp	L(StrlcpyExit15)

	.p2align 4
L(StrncpyExit8Bytes):
	cmp	$4, %ebx
	ja	L(StrncpyExit8Bytes1)

	/* Room is at most 4.  */
	cmpb	$0, (%ecx)
	jz	L(Exit1)
	cmp	$1, %ebx
	je	L(StrlcpyExit1)

	cmpb	$0, 1(%ecx)
	jz	L(Exit2)
	cmp	$2, %ebx
	je	L(StrlcpyExit2)

	cmpb	$0, 2(%ecx)
	jz	L(Exit3)
	cmp	$3, %ebx
	je	L(StrlcpyExit3)

	cmpb	$0, 3(%ecx)
	jz	L(Exit4)
	jmp	L(StrlcpyExit4)

	.p2align 4
L(StrncpyExit8Bytes1):
	/* Room is 5..8: bytes 0..3 cannot hit the limit.  */
	cmpb	$0, (%ecx)
	jz	L(Exit1)
	cmpb	$0, 1(%ecx)
	jz	L(Exit2)
	cmpb	$0, 2(%ecx)
	jz	L(Exit3)
	cmpb	$0, 3(%ecx)
	jz	L(Exit4)

	cmpb	$0, 4(%ecx)
	jz	L(Exit5)
	cmp	$5, %ebx
	je	L(StrlcpyExit5)

	cmpb	$0, 5(%ecx)
	jz	L(Exit6)
	cmp	$6, %ebx
	je	L(StrlcpyExit6)

	cmpb	$0, 6(%ecx)
	jz	L(Exit7)
	cmp	$7, %ebx
	je	L(StrlcpyExit7)

	cmpb	$0, 7(%ecx)
	jz	L(Exit8)
	jmp	L(StrlcpyExit8)

/*
 * From here on only %ebx is on the stack; the CFI_POP below is unwind
 * bookkeeping only (no instruction is emitted for it).
 *
 * L(Prolog_return_start_len): the name matches what the length prologue
 * emits for its L(return_start_len) while L() carried the "Prolog_"
 * infix.  NOTE(review): presumably taken when the prologue finds no NUL
 * in dst within size -- confirm in the include.  Reloads %ebx = size
 * and %ecx = src.
 *
 * L(CalculateLengthOfSrcProlog): also reached from L(StrcpyStep) when
 * the prologue result equals size.
 *   In:  %ecx = src, %ebx = size.
 *   Out: %edx = src (scan position), %ecx = src - size (bias), so the
 *        value finally returned, scan_end - %ecx, is
 *        size + (number of non-NUL bytes in src).
 * Falls through into L(CalculateLengthOfSrc).
 */
	CFI_POP	(%edi)


	.p2align 4
L(Prolog_return_start_len):
	movl	LEN(%esp), %ebx
	movl	SRC(%esp), %ecx
L(CalculateLengthOfSrcProlog):
	mov	%ecx, %edx
	sub	%ebx, %ecx

/*
 * L(CalculateLengthOfSrc): find the NUL that terminates the source and
 * return its biased position.
 *
 * In:   %edx = address at which scanning starts
 *       %ecx = bias; the result is (address of NUL) - %ecx
 *       only %ebx is on the stack (RETURN pops it).
 * Out:  %eax, through the L(exit_tail<k>) / L(exit) stubs.
 * Uses: %eax, %ecx, %edx, %xmm0-%xmm3, flags.
 *
 * Phase 1: test the first 16 bytes one at a time (no alignment needed).
 * Phase 2: round up to the next 16-byte boundary and test sixteen
 *          aligned 16-byte blocks with pcmpeqb/pmovmskb.  %eax is
 *          advanced past each block before the mask is tested, which is
 *          why %ecx is raised by 16 once: L(exit) computes
 *          %eax - %ecx and adds the bit index of the NUL.
 *          A compare register can be reused without clearing it again:
 *          when a block holds no zero byte the pcmpeqb result is all
 *          zero.
 * Phase 3: falls through into the 64-byte loop that follows.
 */
	.p2align 4
L(CalculateLengthOfSrc):
	cmpb	$0, (%edx)
	jz	L(exit_tail0)
	cmpb	$0, 1(%edx)
	jz	L(exit_tail1)
	cmpb	$0, 2(%edx)
	jz	L(exit_tail2)
	cmpb	$0, 3(%edx)
	jz	L(exit_tail3)

	cmpb	$0, 4(%edx)
	jz	L(exit_tail4)
	cmpb	$0, 5(%edx)
	jz	L(exit_tail5)
	cmpb	$0, 6(%edx)
	jz	L(exit_tail6)
	cmpb	$0, 7(%edx)
	jz	L(exit_tail7)

	cmpb	$0, 8(%edx)
	jz	L(exit_tail8)
	cmpb	$0, 9(%edx)
	jz	L(exit_tail9)
	cmpb	$0, 10(%edx)
	jz	L(exit_tail10)
	cmpb	$0, 11(%edx)
	jz	L(exit_tail11)

	cmpb	$0, 12(%edx)
	jz	L(exit_tail12)
	cmpb	$0, 13(%edx)
	jz	L(exit_tail13)
	cmpb	$0, 14(%edx)
	jz	L(exit_tail14)
	cmpb	$0, 15(%edx)
	jz	L(exit_tail15)

	/* Next 16-byte boundary above the start; at most the 16 bytes
	   already tested are skipped.  */
	pxor	%xmm0, %xmm0
	lea	16(%edx), %eax
	add	$16, %ecx
	and	$-16, %eax

	/* Blocks 1..4: the other three compare registers are cleared on
	   the way.  */
	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	/* Blocks 5..8.  */
	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	/* Blocks 9..12.  */
	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	/* Blocks 13..16.  */
	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

/*
 * 64-byte scanning loop; entered by fall-through from the unrolled
 * 16-byte compares.
 *
 * In:   %eax = 16-byte aligned scan address
 *       %ecx = bias + 16
 *       %xmm3 = all zero (its last compare found no zero byte).
 *
 * %eax is first rounded DOWN to a 64-byte boundary, so up to 48 bytes
 * that were already tested may be looked at again.
 *
 * Loop: load four aligned 16-byte blocks, fold them with pminub (a zero
 * byte in any block yields a zero byte in the minimum) and compare the
 * minimum with zero.  %eax is advanced by 64 before the test.
 *
 * After the loop the four blocks are examined in address order to find
 * the one holding the NUL.  %ecx is moved to bias + 64 for the first
 * block and lowered by 16 for each following block, so that the
 * "%eax - %ecx" in L(exit) is the biased start of the matching block.
 * The last block needs no test: the loop only ends when some block
 * contains a zero byte.  Falls through into L(exit).
 */
	and	$-0x40, %eax

	.p2align 4
L(aligned_64_loop):
	movaps	(%eax), %xmm0
	movaps	16(%eax), %xmm1
	movaps	32(%eax), %xmm2
	movaps	48(%eax), %xmm6
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqb	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	lea	64(%eax), %eax
	test	%edx, %edx
	jz	L(aligned_64_loop)

	pcmpeqb	-64(%eax), %xmm3	/* block 0 */
	pmovmskb %xmm3, %edx
	lea	48(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	%xmm1, %xmm3		/* block 1, still held in %xmm1 */
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	-32(%eax), %xmm3	/* block 2 */
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	%xmm6, %xmm3		/* block 3, still held in %xmm6 */
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx

/*
 * L(exit): convert a block mask into the return value.
 *
 * In:   %eax, %ecx such that %eax - %ecx is the biased position of the
 *            first byte of the 16-byte block that contains the NUL
 *       %dx  = pmovmskb mask of that block, non-zero on every path that
 *              reaches this label (bit k set = byte k is zero).
 * Out:  %eax = (%eax - %ecx) + index of the lowest set mask bit,
 *       returned through RETURN (pops %ebx).
 * Uses: %cl / %ch as scratch; %ecx is dead after the subtraction.
 *
 * The mask is narrowed byte -> nibble -> bit.  In each nibble the top
 * bit needs no test: it is the only candidate left.
 */
	.p2align 4
L(exit):
	sub	%ecx, %eax
	test	%dl, %dl
	jz	L(exit_more_8)		/* NUL is in bytes 8..15 */

	mov	%dl, %cl
	and	$15, %cl
	jz	L(exit_more_4)		/* NUL is in bytes 4..7 */
	test	$0x01, %dl
	jnz	L(exit_0)
	test	$0x02, %dl
	jnz	L(exit_1)
	test	$0x04, %dl
	jnz	L(exit_2)
	add	$3, %eax
	RETURN

	.p2align 4
L(exit_more_4):
	test	$0x10, %dl
	jnz	L(exit_4)
	test	$0x20, %dl
	jnz	L(exit_5)
	test	$0x40, %dl
	jnz	L(exit_6)
	add	$7, %eax
	RETURN

	.p2align 4
L(exit_more_8):
	mov	%dh, %ch
	and	$15, %ch
	jz	L(exit_more_12)		/* NUL is in bytes 12..15 */
	test	$0x01, %dh
	jnz	L(exit_8)
	test	$0x02, %dh
	jnz	L(exit_9)
	test	$0x04, %dh
	jnz	L(exit_10)
	add	$11, %eax
	RETURN

	.p2align 4
L(exit_more_12):
	test	$0x10, %dh
	jnz	L(exit_12)
	test	$0x20, %dh
	jnz	L(exit_13)
	test	$0x40, %dh
	jnz	L(exit_14)
	add	$15, %eax
	/* L(exit_0) adds nothing, so it shares the return sequence of the
	   "+15" path above.  */
L(exit_0):
	RETURN

/*
 * L(exit_<k>), k = 1..15: add k to %eax and return through RETURN
 * (pops %ebx).  %eax must already hold the biased position of the start
 * of the 16-byte block; k is the index of the NUL inside it.
 *
 * exit_3, exit_7, exit_11 and exit_15 are not referenced by the code in
 * this file (the dispatcher inlines those four cases); they are kept in
 * case an included file refers to them.
 */
	.p2align 4
L(exit_1):
	add	$1, %eax
	RETURN

L(exit_2):
	add	$2, %eax
	RETURN

L(exit_3):
	add	$3, %eax
	RETURN

L(exit_4):
	add	$4, %eax
	RETURN

L(exit_5):
	add	$5, %eax
	RETURN

L(exit_6):
	add	$6, %eax
	RETURN

L(exit_7):
	add	$7, %eax
	RETURN

L(exit_8):
	add	$8, %eax
	RETURN

L(exit_9):
	add	$9, %eax
	RETURN

L(exit_10):
	add	$10, %eax
	RETURN

L(exit_11):
	add	$11, %eax
	RETURN

L(exit_12):
	add	$12, %eax
	RETURN

L(exit_13):
	add	$13, %eax
	RETURN

L(exit_14):
	add	$14, %eax
	RETURN

L(exit_15):
	add	$15, %eax
	RETURN

/*
 * L(exit_tail<k>), k = 0..15: return stubs for the byte-wise head of
 * L(CalculateLengthOfSrc).
 *
 * In:   %edx = address at which scanning started; the NUL is at %edx + k
 *       %ecx = bias
 * Out:  %eax = (%edx + k) - %ecx, returned through RETURN (pops %ebx).
 */
L(exit_tail0):
	mov	%edx, %eax
	sub	%ecx, %eax
	RETURN

	.p2align 4
L(exit_tail1):
	lea	1(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail2):
	lea	2(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail3):
	lea	3(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail4):
	lea	4(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail5):
	lea	5(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail6):
	lea	6(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail7):
	lea	7(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail8):
	lea	8(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail9):
	lea	9(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail10):
	lea	10(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail11):
	lea	11(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail12):
	lea	12(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail13):
	lea	13(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail14):
	lea	14(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail15):
	lea	15(%edx), %eax
	sub	%ecx, %eax
	RETURN

/* Close the function opened by ENTRY (strlcat_ssse3).  The name must be
   the one given to ENTRY: END expands to ".size name, .-name", which
   needs a symbol defined in this file.  */
END (strlcat_ssse3)
