/*
Copyright (c) 2011, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * Build configuration for the shared template included below.
 * Both STRCPY and STRLEN are mapped to strlcpy_ssse3.  How the template
 * consumes these macros (and USE_AS_STRNCPY / USE_AS_STRLCPY) is decided
 * inside ssse3-strcpy-atom.S, which is not part of this file.
 * The code that follows the #include provides the exit paths; it is closed
 * by END (STRCPY) at the bottom of the file.
 */
#define USE_AS_STRNCPY
#define STRCPY strlcpy_ssse3
#define STRLEN strlcpy_ssse3
#define USE_AS_STRLCPY
#include "ssse3-strcpy-atom.S"

	.p2align 4
/*
 * CopyFrom1To16Bytes: final copy of 1..16 bytes selected by a bit mask.
 *
 * On entry (as used by the code below):
 *   %ax  - bit mask; the lowest set bit k selects L(Exit<k+1>).
 *          NOTE(review): presumably the pmovmskb NUL mask produced by the
 *          included template - confirm against the caller.
 *   %esi - offset added to both pointers before the copy.
 *   %ecx - source pointer (read), %edx - destination pointer (written).
 *   %edi - base address subtracted to form the return value.
 *
 * Each L(ExitN) copies N bytes from (%ecx) to (%edx) and returns
 * (%ecx + N - 1) - %edi in %eax through RETURN1 (macro defined outside
 * this file).
 */
L(CopyFrom1To16Bytes):
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)
	test	%al, %al		/* no bit in the low byte -> index is 8..15 */
	jz	L(ExitHigh8)

L(CopyFrom1To16BytesLess8):
	mov	%al, %ah		/* %ah is scratch here */
	and	$15, %ah		/* low nibble empty -> index is 4..7 */
	jz	L(ExitHigh4)

	test	$0x01, %al
	jnz	L(Exit1)
	test	$0x02, %al
	jnz	L(Exit2)
	test	$0x04, %al
	jnz	L(Exit3)
	/* Only bit 3 can be set at this point: fall through. */
L(Exit4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	lea	3(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(ExitHigh4):
	test	$0x10, %al
	jnz	L(Exit5)
	test	$0x20, %al
	jnz	L(Exit6)
	test	$0x40, %al
	jnz	L(Exit7)
L(Exit8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	lea	7(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(ExitHigh8):
	mov	%ah, %al		/* %al is zero here, reuse it as scratch */
	and	$15, %al		/* low nibble of %ah empty -> index is 12..15 */
	jz	L(ExitHigh12)

	test	$0x01, %ah
	jnz	L(Exit9)
	test	$0x02, %ah
	jnz	L(Exit10)
	test	$0x04, %ah
	jnz	L(Exit11)
L(Exit12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	lea	11(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(ExitHigh12):
	test	$0x10, %ah
	jnz	L(Exit13)
	test	$0x20, %ah
	jnz	L(Exit14)
	test	$0x40, %ah
	jnz	L(Exit15)
L(Exit16):
	movlpd	(%ecx), %xmm0
	movlpd	8(%ecx), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)

	lea	15(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	CFI_PUSH(%esi)

	.p2align 4
/*
 * CopyFrom1To16BytesCase2: a mask bit is set in %ax AND a byte limit is
 * tracked in %ebx.  Whichever comes first decides the exit:
 *   - mask bit k set        -> L(Exit<k+1>)        (copy k+1 bytes, return)
 *   - %ebx == k+1 before it -> L(StrlcpyExit<k+1>) (truncate, keep scanning)
 *
 * %ebx is first increased by 16 and %ecx/%edx are advanced by %esi.
 * NOTE(review): %ebx is presumably the remaining destination capacity
 * biased by -16 in the caller - confirm in the included template.
 */
L(CopyFrom1To16BytesCase2):
	add	$16, %ebx
	add	%esi, %ecx
	add	%esi, %edx

	POP	(%esi)

	test	%al, %al
	jz	L(ExitHighCase2)

	/* A bit is set in the low byte; with a limit above 8 it always wins. */
	cmp	$8, %ebx
	ja	L(CopyFrom1To16BytesLess8)

	test	$0x01, %al
	jnz	L(Exit1)
	cmp	$1, %ebx
	je	L(StrlcpyExit1)
	test	$0x02, %al
	jnz	L(Exit2)
	cmp	$2, %ebx
	je	L(StrlcpyExit2)
	test	$0x04, %al
	jnz	L(Exit3)
	cmp	$3, %ebx
	je	L(StrlcpyExit3)
	test	$0x08, %al
	jnz	L(Exit4)
	cmp	$4, %ebx
	je	L(StrlcpyExit4)
	test	$0x10, %al
	jnz	L(Exit5)
	cmp	$5, %ebx
	je	L(StrlcpyExit5)
	test	$0x20, %al
	jnz	L(Exit6)
	cmp	$6, %ebx
	je	L(StrlcpyExit6)
	test	$0x40, %al
	jnz	L(Exit7)
	cmp	$7, %ebx
	je	L(StrlcpyExit7)
	test	$0x80, %al
	jnz	L(Exit8)
	jmp	L(StrlcpyExit8)

	.p2align 4
L(ExitHighCase2):
	/* No bit in the low byte: a limit of 8 or less is reached first. */
	cmp	$8, %ebx
	jbe	L(CopyFrom1To16BytesLess8Case3)

	test	$0x01, %ah
	jnz	L(Exit9)
	cmp	$9, %ebx
	je	L(StrlcpyExit9)
	test	$0x02, %ah
	jnz	L(Exit10)
	cmp	$10, %ebx
	je	L(StrlcpyExit10)
	test	$0x04, %ah
	jnz	L(Exit11)
	cmp	$11, %ebx
	je	L(StrlcpyExit11)
	test	$0x8, %ah
	jnz	L(Exit12)
	cmp	$12, %ebx
	je	L(StrlcpyExit12)
	test	$0x10, %ah
	jnz	L(Exit13)
	cmp	$13, %ebx
	je	L(StrlcpyExit13)
	test	$0x20, %ah
	jnz	L(Exit14)
	cmp	$14, %ebx
	je	L(StrlcpyExit14)
	test	$0x40, %ah
	jnz	L(Exit15)
	cmp	$15, %ebx
	je	L(StrlcpyExit15)
	test	$0x80, %ah
	jnz	L(Exit16)
	jmp	L(StrlcpyExit16)

	CFI_PUSH(%esi)

	.p2align 4
/*
 * CopyFrom1To16BytesCase2OrCase3: picks Case2 when the mask in %eax is
 * non-zero, otherwise falls into Case3.
 *
 * CopyFrom1To16BytesCase3: only the byte limit in %ebx (after adding 16)
 * selects the exit; every path ends in an L(StrlcpyExitN).
 *
 * L(StrlcpyExitN) stubs in this block (N = 4, 8, 12, 16):
 *   - store %bh at offset N-1 of the destination,
 *     NOTE(review): %bh is presumably 0 (limit <= 16), i.e. the
 *     terminator - confirm against the caller;
 *   - copy N-1 bytes from (%ecx) to (%edx);
 *   - continue at L(CalculateLengthOfSrc) with %edx = %ecx + N (where the
 *     scan resumes) and %ecx = old %edi (base for the returned length).
 * The CFI_PUSH after each jmp is never executed; it only re-establishes
 * the unwind state for the code that follows the POP.
 */
L(CopyFrom1To16BytesCase2OrCase3):
	test	%eax, %eax
	jnz	L(CopyFrom1To16BytesCase2)

	.p2align 4
L(CopyFrom1To16BytesCase3):
	add	$16, %ebx
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)

	cmp	$8, %ebx
	ja	L(ExitHigh8Case3)

L(CopyFrom1To16BytesLess8Case3):
	cmp	$4, %ebx
	ja	L(ExitHigh4Case3)

	cmp	$1, %ebx
	je	L(StrlcpyExit1)
	cmp	$2, %ebx
	je	L(StrlcpyExit2)
	cmp	$3, %ebx
	je	L(StrlcpyExit3)
L(StrlcpyExit4):
	movb	%bh, 3(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	lea	4(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(ExitHigh4Case3):
	cmp	$5, %ebx
	je	L(StrlcpyExit5)
	cmp	$6, %ebx
	je	L(StrlcpyExit6)
	cmp	$7, %ebx
	je	L(StrlcpyExit7)
L(StrlcpyExit8):
	movb	%bh, 7(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax		/* overlapping store covers bytes 3..6 */
	movl	%eax, 3(%edx)

	lea	8(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(ExitHigh8Case3):
	cmp	$12, %ebx
	ja	L(ExitHigh12Case3)

	cmp	$9, %ebx
	je	L(StrlcpyExit9)
	cmp	$10, %ebx
	je	L(StrlcpyExit10)
	cmp	$11, %ebx
	je	L(StrlcpyExit11)
L(StrlcpyExit12):
	movb	%bh, 11(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax		/* overlapping store covers bytes 7..10 */
	movl	%eax, 7(%edx)

	lea	12(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(ExitHigh12Case3):
	cmp	$13, %ebx
	je	L(StrlcpyExit13)
	cmp	$14, %ebx
	je	L(StrlcpyExit14)
	cmp	$15, %ebx
	je	L(StrlcpyExit15)
L(StrlcpyExit16):
	movb	%bh, 15(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0		/* overlapping store covers bytes 7..14 */
	movlpd	%xmm0, 7(%edx)

	lea	16(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
/*
 * Exit stubs for 1..3 bytes.
 *
 * L(StrlcpyExitN): store %bh at destination offset N-1, copy N-1 bytes,
 *   then jump to L(CalculateLengthOfSrc) with %edx = %ecx + N and
 *   %ecx = old %edi.  NOTE(review): %bh is presumably 0 (the terminator) -
 *   confirm against the caller.
 * L(ExitN): copy N bytes from (%ecx) to (%edx) and return
 *   (%ecx + N - 1) - %edi in %eax via RETURN1.
 *
 * L(Exit2) used to load %edi into %eax right before the lea overwrote it;
 * that dead move has been dropped.
 */
L(StrlcpyExit1):
	movb	%bh, (%edx)

	lea	1(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit1):
	movb	(%ecx), %al
	movb	%al, (%edx)

	mov	%ecx, %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit2):
	movb	%bh, 1(%edx)
	movb	(%ecx), %al
	movb	%al, (%edx)

	lea	2(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit2):
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	lea	1(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit3):
	movb	%bh, 2(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	lea	3(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit3):
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	lea	2(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
/*
 * Exit stubs for 5..7 bytes.
 *
 * L(StrlcpyExitN): store %bh at destination offset N-1, copy N-1 bytes,
 *   then jump to L(CalculateLengthOfSrc) with %edx = %ecx + N and
 *   %ecx = old %edi.  NOTE(review): %bh is presumably 0 (the terminator) -
 *   confirm against the caller.
 * L(ExitN): copy N bytes from (%ecx) to (%edx) and return
 *   (%ecx + N - 1) - %edi in %eax via RETURN1.
 *
 * L(StrlcpyExit5) used to load %edi into %eax before the jump; the target
 * rewrites %eax before reading it, so that dead move has been dropped.
 */
L(StrlcpyExit5):
	movb	%bh, 4(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	lea	5(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit5):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	lea	4(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit6):
	movb	%bh, 5(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	lea	6(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit6):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	lea	5(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit7):
	movb	%bh, 6(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	lea	7(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit7):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax		/* overlapping store covers bytes 3..6 */
	movl	%eax, 3(%edx)

	lea	6(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
/*
 * Exit stubs for 9..11 bytes.
 *
 * L(StrlcpyExitN): store %bh at destination offset N-1, copy N-1 bytes,
 *   then jump to L(CalculateLengthOfSrc) with %edx = %ecx + N and
 *   %ecx = old %edi.  NOTE(review): %bh is presumably 0 (the terminator) -
 *   confirm against the caller.
 * L(ExitN): copy N bytes from (%ecx) to (%edx) and return
 *   (%ecx + N - 1) - %edi in %eax via RETURN1.
 * The first 8 bytes always travel through %xmm0 (movlpd).
 */
L(StrlcpyExit9):
	movb	%bh, 8(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	lea	9(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit9):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	lea	8(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit10):
	movb	%bh, 9(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	lea	10(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit10):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	lea	9(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit11):
	movb	%bh, 10(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	lea	11(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit11):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax		/* overlapping store covers bytes 7..10 */
	movl	%eax, 7(%edx)

	lea	10(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
/*
 * Exit stubs for 13..15 bytes.
 *
 * L(StrlcpyExitN): store %bh at destination offset N-1, copy N-1 bytes,
 *   then jump to L(CalculateLengthOfSrc) with %edx = %ecx + N and
 *   %ecx = old %edi.  NOTE(review): %bh is presumably 0 (the terminator) -
 *   confirm against the caller.
 * L(ExitN): copy N bytes from (%ecx) to (%edx) and return
 *   (%ecx + N - 1) - %edi in %eax via RETURN1.
 * Two 8-byte moves are used; the second one overlaps the first so that
 * exactly the required number of bytes is written.
 */
L(StrlcpyExit13):
	movb	%bh, 12(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	lea	13(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit13):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0		/* bytes 5..12 */
	movlpd	%xmm0, 5(%edx)

	lea	12(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit14):
	movb	%bh, 13(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0		/* bytes 5..12 */
	movlpd	%xmm0, 5(%edx)

	lea	14(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit14):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0		/* bytes 6..13 */
	movlpd	%xmm0, 6(%edx)

	lea	13(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit15):
	movb	%bh, 14(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0		/* bytes 6..13 */
	movlpd	%xmm0, 6(%edx)

	lea	15(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit15):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0		/* bytes 7..14 */
	movlpd	%xmm0, 7(%edx)

	lea	14(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	/* From here on %edi is no longer tracked as saved on the stack. */
	CFI_POP (%edi)

	.p2align 4
/*
 * StrlcpyExit0: return 0 in %eax through RETURN without copying anything.
 * The register is cleared with xor, the same idiom L(CalculateLengthOfSrc)
 * uses; no flag is consumed between here and the return.
 */
L(StrlcpyExit0):
	xor	%eax, %eax
	RETURN

	.p2align 4
/*
 * StrncpyExit15Bytes: byte-wise exit for source offsets 8..14.
 *
 * %ecx = source, %edx = destination, %ebx = byte limit.
 * Offsets 0..7 are not examined here.
 * NOTE(review): presumably the caller already knows they are non-zero and
 * that %ebx >= 9 - confirm in the included template.
 *
 * For each offset k the source byte is tested first:
 *   zero            -> L(ExitTail<k+1>)        (copy k+1 bytes, return k)
 *   %ebx == k+1     -> L(StrlcpyExitTail<k+1>) (truncate, keep scanning)
 * When %ebx > 12 the limit cannot be hit at offsets 8..11, so
 * L(StrncpyExit15Bytes1) skips those limit comparisons.
 * The fall-through ends of both halves truncate in place (terminator from
 * %bh, presumably 0) and hand over to L(CalculateLengthOfSrc) with %ecx
 * still pointing at the start of the copied data.
 */
L(StrncpyExit15Bytes):
	cmp	$12, %ebx
	ja	L(StrncpyExit15Bytes1)

	cmpb	$0, 8(%ecx)
	jz	L(ExitTail9)
	cmp	$9, %ebx
	je	L(StrlcpyExitTail9)

	cmpb	$0, 9(%ecx)
	jz	L(ExitTail10)
	cmp	$10, %ebx
	je	L(StrlcpyExitTail10)

	cmpb	$0, 10(%ecx)
	jz	L(ExitTail11)
	cmp	$11, %ebx
	je	L(StrlcpyExitTail11)

	cmpb	$0, 11(%ecx)
	jz	L(ExitTail12)

	/* Truncate after 11 bytes. */
	movb	%bh, 11(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax
	movl	%eax, 7(%edx)

	lea	12(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(StrncpyExit15Bytes1):
	cmpb	$0, 8(%ecx)
	jz	L(ExitTail9)
	cmpb	$0, 9(%ecx)
	jz	L(ExitTail10)
	cmpb	$0, 10(%ecx)
	jz	L(ExitTail11)
	cmpb	$0, 11(%ecx)
	jz	L(ExitTail12)

	cmpb	$0, 12(%ecx)
	jz	L(ExitTail13)
	cmp	$13, %ebx
	je	L(StrlcpyExitTail13)

	cmpb	$0, 13(%ecx)
	jz	L(ExitTail14)
	cmp	$14, %ebx
	je	L(StrlcpyExitTail14)

	cmpb	$0, 14(%ecx)
	jz	L(ExitTail15)

	/* Truncate after 14 bytes. */
	movb	%bh, 14(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0
	movlpd	%xmm0, 6(%edx)

	lea	15(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
/*
 * StrncpyExit8Bytes: byte-wise exit for source offsets 0..7.
 *
 * %ecx = source, %edx = destination, %ebx = byte limit.
 * A limit of 0 goes to L(StrlcpyExitTail0) (nothing is written).
 * For each offset k the source byte is tested first:
 *   zero            -> L(ExitTail<k+1>)        (copy k+1 bytes, return k)
 *   %ebx == k+1     -> L(StrlcpyExitTail<k+1>) (truncate, keep scanning)
 * When %ebx > 4 the limit cannot be hit at offsets 0..3, so
 * L(StrncpyExit8Bytes1) skips those limit comparisons.
 * The fall-through ends of both halves truncate in place (terminator from
 * %bh, presumably 0 - confirm against the caller) and hand over to
 * L(CalculateLengthOfSrc).
 */
L(StrncpyExit8Bytes):
	cmp	$4, %ebx
	ja	L(StrncpyExit8Bytes1)

	test	%ebx, %ebx
	jz	L(StrlcpyExitTail0)

	cmpb	$0, (%ecx)
	jz	L(ExitTail1)
	cmp	$1, %ebx
	je	L(StrlcpyExitTail1)

	cmpb	$0, 1(%ecx)
	jz	L(ExitTail2)
	cmp	$2, %ebx
	je	L(StrlcpyExitTail2)

	cmpb	$0, 2(%ecx)
	jz	L(ExitTail3)
	cmp	$3, %ebx
	je	L(StrlcpyExitTail3)

	cmpb	$0, 3(%ecx)
	jz	L(ExitTail4)

	/* Truncate after 3 bytes. */
	movb	%bh, 3(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	lea	4(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(StrncpyExit8Bytes1):
	cmpb	$0, (%ecx)
	jz	L(ExitTail1)
	cmpb	$0, 1(%ecx)
	jz	L(ExitTail2)
	cmpb	$0, 2(%ecx)
	jz	L(ExitTail3)
	cmpb	$0, 3(%ecx)
	jz	L(ExitTail4)

	cmpb	$0, 4(%ecx)
	jz	L(ExitTail5)
	cmp	$5, %ebx
	je	L(StrlcpyExitTail5)

	cmpb	$0, 5(%ecx)
	jz	L(ExitTail6)
	cmp	$6, %ebx
	je	L(StrlcpyExitTail6)

	cmpb	$0, 6(%ecx)
	jz	L(ExitTail7)
	cmp	$7, %ebx
	je	L(StrlcpyExitTail7)

	cmpb	$0, 7(%ecx)
	jz	L(ExitTail8)

	/* Truncate after 7 bytes. */
	movb	%bh, 7(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax
	movl	%eax, 3(%edx)

	lea	8(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
/*
 * Tail stubs for 0..4 bytes.  Unlike the non-Tail stubs they leave %ecx
 * alone and do not pop %edi.
 *
 * L(StrlcpyExitTailN): store %bh at destination offset N-1 (presumably the
 *   terminator - confirm that %bh is 0 in the caller), copy N-1 bytes and
 *   jump to L(CalculateLengthOfSrc) with %edx = %ecx + N.
 *   N = 0 writes nothing and scans from %ecx itself.
 * L(ExitTailN): copy N bytes and return the constant N-1 via RETURN.
 *
 * L(ExitTail2) used to load %edx into %eax right before the constant
 * overwrote it; that dead move has been dropped.  L(ExitTail1) clears %eax
 * with xor, the idiom L(CalculateLengthOfSrc) uses.
 */
L(StrlcpyExitTail0):
	mov	%ecx, %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(StrlcpyExitTail1):
	movb	%bh, (%edx)

	lea	1(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail1):
	movb	(%ecx), %al
	movb	%al, (%edx)

	xor	%eax, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail2):
	movb	%bh, 1(%edx)
	movb	(%ecx), %al
	movb	%al, (%edx)

	lea	2(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail2):
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	mov	$1, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail3):
	movb	%bh, 2(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	lea	3(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail3):
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	mov	$2, %eax
	RETURN

	.p2align 4
L(ExitTail4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	mov	$3, %eax
	RETURN

	.p2align 4
/*
 * Tail stubs for 5..8 bytes (%ecx is left alone, %edi is not popped).
 *
 * L(StrlcpyExitTailN): store %bh at destination offset N-1 (presumably the
 *   terminator - confirm that %bh is 0 in the caller), copy N-1 bytes and
 *   jump to L(CalculateLengthOfSrc) with %edx = %ecx + N.
 * L(ExitTailN): copy N bytes and return the constant N-1 via RETURN.
 *
 * L(StrlcpyExitTail5) used to load %edx into %eax before the jump; the
 * target rewrites %eax before reading it, so that dead move has been
 * dropped.
 */
L(StrlcpyExitTail5):
	movb	%bh, 4(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	lea	5(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail5):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	mov	$4, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail6):
	movb	%bh, 5(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	lea	6(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail6):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	mov	$5, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail7):
	movb	%bh, 6(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	lea	7(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail7):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax		/* overlapping store covers bytes 3..6 */
	movl	%eax, 3(%edx)

	mov	$6, %eax
	RETURN

	.p2align 4
L(ExitTail8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	mov	$7, %eax
	RETURN

	.p2align 4
/*
 * Tail stubs for 9..12 bytes (%ecx is left alone, %edi is not popped).
 *
 * L(StrlcpyExitTailN): store %bh at destination offset N-1 (presumably the
 *   terminator - confirm that %bh is 0 in the caller), copy N-1 bytes and
 *   jump to L(CalculateLengthOfSrc) with %edx = %ecx + N.
 * L(ExitTailN): copy N bytes and return the constant N-1 via RETURN.
 */
L(StrlcpyExitTail9):
	movb	%bh, 8(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	lea	9(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail9):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	mov	$8, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail10):
	movb	%bh, 9(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	lea	10(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail10):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	mov	$9, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail11):
	movb	%bh, 10(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	lea	11(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail11):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax		/* overlapping store covers bytes 7..10 */
	movl	%eax, 7(%edx)

	mov	$10, %eax
	RETURN

	.p2align 4
L(ExitTail12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	mov	$11, %eax
	RETURN

	.p2align 4
/*
 * Tail stubs for 13..16 bytes (%ecx is left alone, %edi is not popped).
 *
 * L(StrlcpyExitTailN): store %bh at destination offset N-1 (presumably the
 *   terminator - confirm that %bh is 0 in the caller), copy N-1 bytes and
 *   jump to L(CalculateLengthOfSrc) with %edx = %ecx + N.
 * L(ExitTailN): copy N bytes and return the constant N-1 via RETURN.
 * L(StrlcpyExitTail16) and L(ExitTail16) have no branch to them in this
 * file; presumably they are reached from the included template.
 */
L(StrlcpyExitTail13):
	movb	%bh, 12(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	lea	13(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail13):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0		/* bytes 5..12 */
	movlpd	%xmm0, 5(%edx)

	mov	$12, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail14):
	movb	%bh, 13(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0		/* bytes 5..12 */
	movlpd	%xmm0, 5(%edx)

	lea	14(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail14):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0		/* bytes 6..13 */
	movlpd	%xmm0, 6(%edx)

	mov	$13, %eax
	RETURN

	.p2align 4
L(ExitTail15):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0		/* bytes 7..14 */
	movlpd	%xmm0, 7(%edx)

	mov	$14, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail16):
	movb	%bh, 15(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0		/* bytes 7..14 */
	movlpd	%xmm0, 7(%edx)

	lea	16(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail16):
	movlpd	(%ecx), %xmm0
	movlpd	8(%ecx), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)

	mov	$15, %eax
	RETURN

	.p2align 4
/*
 * CalculateLengthOfSrc: find the first zero byte at or after %edx and
 * return its distance from %ecx in %eax.
 *
 * In:   %edx = address where the scan starts
 *       %ecx = base address the result is measured from
 * Out:  %eax = (address of first zero byte) - %ecx
 *
 * Phase 1: probe bytes 0..15 one by one (L(exit_tailN) computes the result).
 * Phase 2: sixteen unrolled 16-byte pcmpeqb/pmovmskb probes starting at the
 *          16-byte boundary above %edx.  %eax is advanced past each chunk
 *          before the branch, so %ecx is bumped by 16 once to compensate;
 *          L(exit) then adds the bit index found in %edx.
 *          An xmm register can be reused as the "zero" operand of a later
 *          probe because the probe that last wrote it found no match and
 *          therefore left it all-zero.
 * Phase 3: round %eax down to a 64-byte boundary and fall into
 *          L(aligned_64_loop).
 *
 * NOTE(review): %eax is rewritten on every path before it is read, so the
 * initial xor looks redundant; it is kept unchanged here.
 */
L(CalculateLengthOfSrc):
	xor	%eax, %eax
	cmpb	$0, (%edx)
	jz	L(exit_tail0)
	cmpb	$0, 1(%edx)
	jz	L(exit_tail1)
	cmpb	$0, 2(%edx)
	jz	L(exit_tail2)
	cmpb	$0, 3(%edx)
	jz	L(exit_tail3)

	cmpb	$0, 4(%edx)
	jz	L(exit_tail4)
	cmpb	$0, 5(%edx)
	jz	L(exit_tail5)
	cmpb	$0, 6(%edx)
	jz	L(exit_tail6)
	cmpb	$0, 7(%edx)
	jz	L(exit_tail7)

	cmpb	$0, 8(%edx)
	jz	L(exit_tail8)
	cmpb	$0, 9(%edx)
	jz	L(exit_tail9)
	cmpb	$0, 10(%edx)
	jz	L(exit_tail10)
	cmpb	$0, 11(%edx)
	jz	L(exit_tail11)

	cmpb	$0, 12(%edx)
	jz	L(exit_tail12)
	cmpb	$0, 13(%edx)
	jz	L(exit_tail13)
	cmpb	$0, 14(%edx)
	jz	L(exit_tail14)
	cmpb	$0, 15(%edx)
	jz	L(exit_tail15)

	/* Switch to aligned 16-byte probes; %edx becomes the match mask. */
	pxor	%xmm0, %xmm0
	lea	16(%edx), %eax
	add	$16, %ecx
	and	$-16, %eax

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	/* Round down to 64 bytes; chunks already probed may be read again. */
	and	$-0x40, %eax

	.p2align 4
/*
 * aligned_64_loop: scan 64 bytes per iteration from the 64-byte aligned
 * address in %eax.
 *
 * The four 16-byte chunks are folded with pminub; a zero byte anywhere in
 * the 64 bytes leaves a zero byte in the minimum, which pcmpeqb against
 * %xmm3 detects.  %xmm3 is all-zero on entry from the unrolled probes above
 * (its last probe found no match), and each pcmpeqb into %xmm3 below leaves
 * it all-zero again whenever the following branch is not taken.
 *
 * After the loop %eax points 64 bytes past the block that contains the
 * zero byte.  The four chunks are re-tested in order; %ecx is adjusted
 * (+48, then -16 per chunk) so that "%eax - %ecx" at L(exit) is the offset
 * of the matching chunk from the original base.  %xmm0 and %xmm2 were
 * overwritten by pminub, so chunks 0 and 2 are compared from memory, while
 * chunks 1 and 3 are still held in %xmm1 and %xmm6.
 * The last chunk needs no test: it must contain the zero byte, so control
 * falls straight through into L(exit).
 */
L(aligned_64_loop):
	movaps	(%eax), %xmm0
	movaps	16(%eax), %xmm1
	movaps	32(%eax), %xmm2
	movaps	48(%eax), %xmm6
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqb	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	lea	64(%eax), %eax
	test	%edx, %edx
	jz	L(aligned_64_loop)

	/* chunk 0 */
	pcmpeqb	-64(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	48(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	/* chunk 1 */
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	/* chunk 2 */
	pcmpeqb	-32(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	/* chunk 3 */
	pcmpeqb	%xmm6, %xmm3
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx

	.p2align 4
/*
 * exit: turn the 16-bit match mask in %dx into the final length.
 *
 * In:   %eax = address just past the matching 16-byte chunk
 *       %ecx = adjusted base (see the callers), %dx = match mask
 * Out:  %eax = (%eax - %ecx) + index of the lowest set bit in %dx
 *
 * The mask is narrowed byte -> nibble -> bit; %cl / %ch are scratch because
 * %ecx has already been consumed by the subtraction.  The highest bit of
 * each nibble needs no test: when the three lower bits are clear it is the
 * only candidate left, so the code adds the constant and returns directly.
 * L(exit_0) shares the RETURN at the end of L(exit_more_12).
 */
L(exit):
	sub	%ecx, %eax
	test	%dl, %dl
	jz	L(exit_more_8)

	mov	%dl, %cl
	and	$15, %cl
	jz	L(exit_more_4)
	test	$0x01, %dl
	jnz	L(exit_0)
	test	$0x02, %dl
	jnz	L(exit_1)
	test	$0x04, %dl
	jnz	L(exit_2)
	add	$3, %eax
	RETURN

	.p2align 4
L(exit_more_4):
	test	$0x10, %dl
	jnz	L(exit_4)
	test	$0x20, %dl
	jnz	L(exit_5)
	test	$0x40, %dl
	jnz	L(exit_6)
	add	$7, %eax
	RETURN

	.p2align 4
L(exit_more_8):
	mov	%dh, %ch
	and	$15, %ch
	jz	L(exit_more_12)
	test	$0x01, %dh
	jnz	L(exit_8)
	test	$0x02, %dh
	jnz	L(exit_9)
	test	$0x04, %dh
	jnz	L(exit_10)
	add	$11, %eax
	RETURN

	.p2align 4
L(exit_more_12):
	test	$0x10, %dh
	jnz	L(exit_12)
	test	$0x20, %dh
	jnz	L(exit_13)
	test	$0x40, %dh
	jnz	L(exit_14)
	add	$15, %eax
L(exit_0):
	RETURN

	.p2align 4
/*
 * exit_N (N = 1..15): add the bit index N to the chunk offset already in
 * %eax and return it via RETURN.
 * exit_3, exit_7, exit_11 and exit_15 have no branch to them in this file
 * (L(exit) handles those indices inline); they may still be referenced
 * from the included template, so they are kept.
 */
L(exit_1):
	add	$1, %eax
	RETURN

L(exit_2):
	add	$2, %eax
	RETURN

L(exit_3):
	add	$3, %eax
	RETURN

L(exit_4):
	add	$4, %eax
	RETURN

L(exit_5):
	add	$5, %eax
	RETURN

L(exit_6):
	add	$6, %eax
	RETURN

L(exit_7):
	add	$7, %eax
	RETURN

L(exit_8):
	add	$8, %eax
	RETURN

L(exit_9):
	add	$9, %eax
	RETURN

L(exit_10):
	add	$10, %eax
	RETURN

L(exit_11):
	add	$11, %eax
	RETURN

L(exit_12):
	add	$12, %eax
	RETURN

L(exit_13):
	add	$13, %eax
	RETURN

L(exit_14):
	add	$14, %eax
	RETURN

L(exit_15):
	add	$15, %eax
	RETURN

/*
 * exit_tailN (N = 0..15): targets of the byte-wise probes at the start of
 * L(CalculateLengthOfSrc).  The zero byte is at N(%edx), so the result is
 * (%edx + N) - %ecx, returned in %eax via RETURN.
 */
L(exit_tail0):
	mov	%edx, %eax
	sub	%ecx, %eax
	RETURN

	.p2align 4
L(exit_tail1):
	lea	1(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail2):
	lea	2(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail3):
	lea	3(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail4):
	lea	4(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail5):
	lea	5(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail6):
	lea	6(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail7):
	lea	7(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail8):
	lea	8(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail9):
	lea	9(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail10):
	lea	10(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail11):
	lea	11(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail12):
	lea	12(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail13):
	lea	13(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail14):
	lea	14(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail15):
	lea	15(%edx), %eax
	sub	%ecx, %eax
	RETURN

/* Closes the function whose entry is defined by the included template. */
END (STRCPY)
