/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* L(name) expands to .Lname, i.e. an assembler-local label.  */
#ifndef L
# define L(label)	.L##label
#endif

/* Fallbacks that map the cfi_* spellings onto the assembler's own
   .cfi_* directives when no including header has defined them.  */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

/* ENTRY(name): declare `name` as a global function symbol, align it to
   16 bytes, place the label and open the CFI region.  */
#ifndef ENTRY
# define ENTRY(name)	\
	.type name, @function;	\
	.globl name;	\
	.p2align 4;	\
name:	\
	cfi_startproc
#endif

/* END(name): close the CFI region opened by ENTRY and record the size
   of the symbol as the distance from `name` to this point.  */
#ifndef END
# define END(name)	\
	cfi_endproc;	\
	.size name, .-name
#endif


/* Symbol name given to the routine below.  It can be overridden before
   this point (the body also has a USE_AS_STRLCAT variant).  */
#ifndef STRLCPY
# define STRLCPY	strlcpy
#endif

/* JMPTBL(I, B): one table entry, stored as the offset of handler I from
   the table base B.  */
#define JMPTBL(I, B)	I - B

/* BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE): load the address of TABLE
   RIP-relatively, fetch the sign-extended 32-bit offset stored at
   TABLE + INDEX * SCALE, add the table base back and jump there.
   Clobbers %r11 and %rcx.  */
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
	lea	TABLE(%rip), %r11;	\
	movslq	(%r11, INDEX, SCALE), %rcx;	\
	lea	(%r11, %rcx), %rcx;	\
	jmp	*%rcx

/* RETURN: the result is kept as two partial sums; fold %r9 into %rax
   and return.  */
#define RETURN	\
	add	%r9, %rax;	\
	ret

.text
/*
 * STRLCPY (strlcpy by default; the USE_AS_STRLCAT sections turn the same
 * body into the appending variant).
 *
 * Register use as seen in this file (AT&T syntax):
 *   In:   %rdi = destination buffer (written)
 *         %rsi = source string (read)
 *         %rdx = size of the destination buffer, copied to %r8
 *   Out:  %rax + %r9, combined by the RETURN macro
 *   Work: %r8  = destination space still available
 *         %rax = running byte count / block offset
 *         %r9  = second partial sum of the result
 *         %rcx, %rdx, %r10, %r11, %xmm0-%xmm2 = scratch
 * No stack is used and no callee-saved register is touched.
 * NOTE(review): presumably follows the BSD strlcpy/strlcat contract
 * (result = length of the string it tried to create) - confirm against
 * the C prototype used by callers.
 */
ENTRY (STRLCPY)
	xor	%eax, %eax		/* rax = 0 */
	xor	%r9d, %r9d		/* r9 = 0 */
	mov	%rdx, %r8
	test	%r8, %r8
	jz	L(CalculateSrcLen)	/* size == 0: nothing may be written */

#ifdef USE_AS_STRLCAT
/*
 * strlcat only: locate the NUL that ends the existing destination
 * string, examining no more than %r8 (size) bytes of it.
 *
 * %xmm0 is the all-zero comparand.  pcmpeqb leaves it all-zero whenever
 * the block holds no NUL, and every fall-through below happens only in
 * that case, so it is reused without being cleared again.
 *
 * NOTE(review): the first movdqu reads 16 bytes from %rdi before the
 * size has been compared with 17; confirm callers can tolerate reads
 * past `size` bytes of the destination.
 */
	xor	%ecx, %ecx		/* rcx = 0: no alignment offset yet */
	pxor	%xmm0, %xmm0

	movdqu	(%rdi), %xmm1
	pcmpeqb %xmm1, %xmm0
	pmovmskb %xmm0, %rdx		/* rdx = NUL mask for dst[0..15] */

	cmp	$17, %r8
	jb	L(SizeEndCase1)		/* size <= 16 */
	test	%rdx, %rdx
	jnz	L(StringEndCase1)

	add	$16, %rax
	movdqu	16(%rdi), %xmm1
	pcmpeqb %xmm1, %xmm0
	pmovmskb %xmm0, %rdx		/* rdx = NUL mask for dst[16..31] */

	cmp	$33, %r8
	jb	L(SizeEndCase1)		/* size <= 32 */
	test	%rdx, %rdx
	jnz	L(StringEndCase1)

	/* Switch to aligned loads: rcx = dst & 15, rdi rounded down to 16.
	   r8 becomes the size still available counted from (rdi + 16).  */
	mov	%rdi, %rcx
	and	$15, %rcx
	and	$-16, %rdi

	add	%rcx, %r8
	sub	$16, %r8

L(DstLenLoop):
	movdqa	(%rdi, %rax), %xmm1
	pcmpeqb %xmm1, %xmm0
	pmovmskb %xmm0, %rdx
	sub	$16, %r8
	jbe	L(SizeEndCase2)		/* size runs out inside this block */
	test	%rdx, %rdx
	jnz	L(StringEndCase2)
	add	$16, %rax
	jmp	L(DstLenLoop)

/* NUL found by the aligned loop with space left after this block.  */
L(StringEndCase2):
	add	$16, %r8		/* undo the loop's subtraction */
	bsf	%rdx, %rdx		/* rdx = NUL offset inside the block */
	sub	%rdx, %r8		/* r8 = space left counted from the NUL */
	add	%rdx, %rax		/* rax = NUL offset from the aligned rdi */
	sub	%rcx, %r9		/* r9 = -rcx; CopySrcString adds rax to it */
	add	%rax, %rdi		/* rdi -> the destination's NUL */
	/* NOTE(review): this enters CopySrcString without the source-offset
	   check done on the fall-through path after StringEnd - confirm
	   that is intended.  */
	jmp	L(CopySrcString)

/* size <= 16 or <= 32: rax = offset of the block just examined (0 or
   16), rdx = its NUL mask, r8 = size, rcx = 0.  */
L(SizeEndCase1):
	test	%rdx, %rdx
	jz	L(SizeEnd)
	bsf	%rdx, %rdx
	add	%rdx, %rax		/* rax = offset of the NUL in dst */
	cmp	%r8, %rax
	jb	L(StringEnd)		/* NUL lies inside the first size bytes */
L(SizeEnd):
	/* No NUL inside size bytes: nothing is copied, the result is
	   size + length of the source.  */
	mov	%r8, %r9
	jmp	L(CalculateSrcLenCase1)

/* The aligned loop reached the block in which size ends.  */
L(SizeEndCase2):
	add	$16, %r8		/* r8 = size bytes left inside this block */
	test	%rdx, %rdx
	jz	L(StringEndCase4)
	bsf	%rdx, %rdx
	cmp	%r8, %rdx
	jb	L(StringEndCase3)	/* NUL comes before size ends */
L(StringEndCase4):
	/* No NUL inside size bytes: r9 = (rax + r8) - rcx, then only the
	   source length is added.  */
	add	%r8, %rax
	sub	%rcx, %rax
	mov	%rax, %r9
	jmp	L(CalculateSrcLenCase1)

/* Same bookkeeping as StringEndCase2, for the block in which size ends.  */
L(StringEndCase3):
	add	%rdx, %rax
	sub	%rcx, %r9
	add	%rax, %rdi
	sub	%rdx, %r8
	/* NOTE(review): also skips the source-offset check - see
	   StringEndCase2.  */
	jmp	L(CopySrcString)

/* NUL found in one of the two unaligned blocks (rcx is still 0 here).  */
L(StringEndCase1):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	sub	%rcx, %rax
L(StringEnd):
	add	%rax, %rdi		/* rdi -> the destination's NUL */
	sub	%rax, %r8		/* r8 = space left from there */
	/* rax keeps the offset of the NUL and is part of the result.  */
#endif

/*
 * Choose how to start reading the source.  When (src & 63) <= 32 control
 * goes to CopySrcString, which begins with unaligned 16-byte loads at
 * src and src + 16 (presumably safe because both then stay inside the
 * 64-byte-aligned block that holds src - TODO confirm intent).
 * Otherwise the first two blocks are examined through 16-byte-aligned
 * loads, with the bytes that precede src shifted out of the mask.
 */
	mov	%rsi, %rcx
	and	$63, %rcx
	cmp	$32, %rcx
	jbe	L(CopySrcString)

	and	$-16, %rsi		/* rsi = src rounded down to 16 */
	and	$15, %rcx		/* rcx = src offset inside that block */
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1

	pcmpeqb	(%rsi), %xmm1
	pmovmskb %xmm1, %rdx
	shr	%cl, %rdx		/* bit 0 now corresponds to src[0] */
	mov	$16, %r10
	sub	%rcx, %r10		/* r10 = source bytes held by this block */
	cmp	%r10, %r8
	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)	/* space ends here */
	test	%rdx, %rdx
	jnz	L(CopyFrom1To16BytesTail)		/* NUL in first block */

	pcmpeqb	16(%rsi), %xmm0
	pmovmskb %xmm0, %rdx
	add	$16, %r10		/* r10 = source bytes held by both blocks */
	cmp	%r10, %r8
	jbe	L(CopyFrom1To32BytesCase2OrCase3)
	test	%rdx, %rdx
	jnz	L(CopyFrom1To32Bytes)

	/* No NUL in either block and more space than they hold: store the
	   first 16 source bytes and continue in the aligned loop.  */
	movdqu	(%rsi, %rcx), %xmm1
	movdqu	%xmm1, (%rdi)
#ifdef USE_AS_STRLCAT
	add	%rax, %r9		/* move the count so far into r9; */
					/* LoopStart overwrites rax */
#endif
	jmp	L(LoopStart)

/*
 * CopySrcString: start with unaligned loads of src[0..15] and
 * src[16..31].
 *   rdx = NUL mask of the block just examined
 *   r8  = destination space left
 * Falls into the aligned loop once neither block holds a NUL and more
 * than 32 bytes of space remain.
 */
	.p2align 4
L(CopySrcString):
#ifdef USE_AS_STRLCAT
	add	%rax, %r9		/* fold the count so far into r9 ... */
	xor	%eax, %eax		/* ... and restart rax at 0 */
#endif
	pxor	%xmm0, %xmm0
	movdqu	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %rdx

	cmp	$17, %r8
	jb	L(CopyFrom1To16BytesTail1Case2OrCase3)	/* space <= 16 */
	test	%rdx, %rdx
	jnz	L(CopyFrom1To16BytesTail1)

	movdqu	16(%rsi), %xmm2
	pcmpeqb	%xmm2, %xmm0
	movdqu	%xmm1, (%rdi)		/* first 16 bytes hold no NUL: store */
	pmovmskb %xmm0, %rdx
	add	$16, %rax

	cmp	$33, %r8
	jb	L(CopyFrom1To32Bytes1Case2OrCase3)	/* space <= 32 */
	test	%rdx, %rdx
	jnz	L(CopyFrom1To32Bytes1)

	mov	%rsi, %rcx
	and	$15, %rcx		/* rcx = src offset inside its block */
	and	$-16, %rsi		/* rsi = src rounded down to 16 */

/*
 * LoopStart: rsi is 16-byte aligned and rcx is the offset of the real
 * source start inside that block.  rdi is moved back by the same amount
 * so that equal offsets from rsi and rdi address matching bytes.  r8
 * becomes the space left counted from (rdi + 16).
 */
L(LoopStart):
	sub	%rcx, %rdi
	add	%rcx, %r8
	sub	$16, %r8
	mov	$16, %rax

/* One aligned 16-byte source block per iteration; rax = block offset.  */
L(16Loop):
	movdqa	(%rsi, %rax), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %rdx
	sub	$16, %r8
	jbe	L(CopyFrom1To16BytesCase2OrCase3)	/* space ends in this block */
	test	%rdx, %rdx
	jnz	L(CopyFrom1To16BytesXmmExit)		/* NUL in this block */
	movdqu	%xmm1, (%rdi, %rax)
	add	$16, %rax
	jmp	L(16Loop)

/*------End of main part with loops---------------------*/

/*
 * Case1: the source NUL was found and enough destination space remains.
 * Each label leaves rdx = offset of the NUL from rsi, adds that offset
 * to rax and dispatches through ExitStringTailTable, whose handlers
 * copy rdx + 1 bytes and return.
 */
	.p2align 4
/* No branch to this label appears in this file.  */
L(CopyFrom1To16Bytes):
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%rdx, %rdx
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

	.p2align 4
/* Aligned prologue, NUL in the first block: rsi is still rounded down,
   so add the offset back.  */
L(CopyFrom1To16BytesTail):
	add	%rcx, %rsi
	bsf	%rdx, %rdx
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

	.p2align 4
/* CopySrcString, NUL in src[16..31]: step both pointers past the 16
   bytes that were already stored.  */
L(CopyFrom1To32Bytes1):
	add	$16, %rsi
	add	$16, %rdi
	sub	$16, %r8
/* CopySrcString, NUL in src[0..15].  */
L(CopyFrom1To16BytesTail1):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

	.p2align 4
/* Aligned prologue, NUL in the second block: the offset from the real
   source start is (bit index + 16 - rcx).  */
L(CopyFrom1To32Bytes):
	bsf	%rdx, %rdx
	add	%rcx, %rsi
	add	$16, %rdx
	sub	%rcx, %rdx
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

	.p2align 4
/* Shared tail of the Case2 labels: rdx already holds the NUL offset and
   it is smaller than the space left.  */
L(CopyFrom1To16BytesExit):
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

/*
 * Case2: the block holds a NUL and the destination space also ends in
 * it.  If the NUL offset (rdx) is below the space left (r8) the string
 * fits and CopyFrom1To16BytesExit finishes the copy; otherwise r8 is
 * added to rax and ExitTable truncates the copy to the space left.
 */

	.p2align 4
/* From the aligned loop: rax = block offset, rcx = source misalignment.  */
L(CopyFrom1To16BytesCase2):
	add	$16, %r8		/* undo the loop's subtraction */
	add	%rax, %rdi
	add	%rax, %rsi
	bsf	%rdx, %rdx
	sub	%rcx, %rax		/* rax = bytes stored so far */
	cmp	%r8, %rdx
	jb	L(CopyFrom1To16BytesExit)
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
/* From the aligned prologue, second block.  */
L(CopyFrom1To32BytesCase2):
	add	%rcx, %rsi
	bsf	%rdx, %rdx
	add	$16, %rdx
	sub	%rcx, %rdx		/* rdx = NUL offset from the real source */
	cmp	%r8, %rdx
	jb	L(CopyFrom1To16BytesExit)
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
/* From the aligned prologue, first block.  */
L(CopyFrom1To16BytesTailCase2):
	add	%rcx, %rsi
	bsf	%rdx, %rdx
	cmp	%r8, %rdx
	jb	L(CopyFrom1To16BytesExit)
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
/* From CopySrcString (either unaligned block).  */
L(CopyFrom1To16BytesTail1Case2):
	bsf	%rdx, %rdx
	cmp	%r8, %rdx
	jb	L(CopyFrom1To16BytesExit)
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

/*
 * Case2 or Case3: the destination space ends in the current block.
 * A non-zero NUL mask hands over to the matching Case2 label.  With no
 * NUL in the block (Case3) r8 is added to rax and ExitTable is entered
 * with index r8: its handler copies r8 - 1 bytes, stores the NUL and
 * goes on to measure the rest of the source.
 */

	.p2align 4
/* From the aligned loop.  */
L(CopyFrom1To16BytesCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyFrom1To16BytesCase2)
	add	$16, %r8		/* undo the loop's subtraction */
	add	%rax, %rdi
	add	%rax, %rsi
	add	%r8, %rax
	sub	%rcx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
/* From the aligned prologue, second block.  */
L(CopyFrom1To32BytesCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyFrom1To32BytesCase2)
	add	%rcx, %rsi
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
/* From the aligned prologue, first block.  */
L(CopyFrom1To16BytesTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyFrom1To16BytesTailCase2)
	add	%rcx, %rsi
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
/* From CopySrcString, second block: skip the 16 bytes already stored.  */
L(CopyFrom1To32Bytes1Case2OrCase3):
	add	$16, %rdi
	add	$16, %rsi
	sub	$16, %r8
/* From CopySrcString, first block.  */
L(CopyFrom1To16BytesTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyFrom1To16BytesTail1Case2)
	add	%r8, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)

	.p2align 4
/* From the aligned loop: NUL found and space remains after this block.
   rax ends up as (block offset + NUL offset - source misalignment).  */
L(CopyFrom1To16BytesXmmExit):
	bsf	%rdx, %rdx
	add	%rax, %rdi
	add	%rax, %rsi
	add	%rdx, %rax
	sub	%rcx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

/*------End of labels for copying 1-16 bytes and 1-32 bytes------*/


/*
 * Exit<N> handlers, reached through ExitTable with N = %r8, the
 * destination space left.  Exit<N> copies N - 1 bytes from (%rsi) to
 * (%rdi), stores a NUL at offset N - 1 and jumps to CalculateSrcLen,
 * which advances %rsi by %r8 and counts the rest of the source.  Exit0
 * stores nothing and returns at once.
 *
 * Lengths that are not a single register width use two moves that
 * overlap in the middle.  %rcx, %rdx, %xmm0 and %xmm2 are scratch here;
 * CalculateSrcLen reloads %rcx, %rdx and %xmm0 before reading them.
 */
	.p2align 4
L(Exit0):
	RETURN

	.p2align 4
L(Exit1):
	movb	$0, (%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit2):
	movb	(%rsi), %dl
	movb	%dl, (%rdi)
	movb	$0, 1(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit3):
	movw	(%rsi), %dx
	movw	%dx, (%rdi)
	movb	$0, 2(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit4):
	movw	(%rsi), %cx
	movb	2(%rsi), %dl
	movw	%cx, (%rdi)
	movb	%dl, 2(%rdi)
	movb	$0, 3(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit5):
	movl	(%rsi), %edx
	movl	%edx, (%rdi)
	movb	$0, 4(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit6):
	movl	(%rsi), %ecx
	movb	4(%rsi), %dl
	movl	%ecx, (%rdi)
	movb	%dl, 4(%rdi)
	movb	$0, 5(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit7):
	movl	(%rsi), %ecx
	movw	4(%rsi), %dx
	movl	%ecx, (%rdi)
	movw	%dx, 4(%rdi)
	movb	$0, 6(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit8):
	movl	(%rsi), %ecx
	movl	3(%rsi), %edx
	movl	%ecx, (%rdi)
	movl	%edx, 3(%rdi)
	movb	$0, 7(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit9):
	movq	(%rsi), %rdx
	movq	%rdx, (%rdi)
	movb	$0, 8(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit10):
	movq	(%rsi), %rcx
	movb	8(%rsi), %dl
	movq	%rcx, (%rdi)
	movb	%dl, 8(%rdi)
	movb	$0, 9(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit11):
	movq	(%rsi), %rcx
	movw	8(%rsi), %dx
	movq	%rcx, (%rdi)
	movw	%dx, 8(%rdi)
	movb	$0, 10(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit12):
	movq	(%rsi), %rcx
	movl	7(%rsi), %edx
	movq	%rcx, (%rdi)
	movl	%edx, 7(%rdi)
	movb	$0, 11(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit13):
	movq	(%rsi), %rcx
	movl	8(%rsi), %edx
	movq	%rcx, (%rdi)
	movl	%edx, 8(%rdi)
	movb	$0, 12(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit14):
	movq	(%rsi), %rcx
	movq	5(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 5(%rdi)
	movb	$0, 13(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit15):
	movq	(%rsi), %rcx
	movq	6(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 6(%rdi)
	movb	$0, 14(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit16):
	movq	(%rsi), %rcx
	movq	7(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 7(%rdi)
	movb	$0, 15(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit17):
	movdqu	(%rsi), %xmm0
	movdqu	%xmm0, (%rdi)
	movb	$0, 16(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit18):
	movdqu	(%rsi), %xmm0
	movb	16(%rsi), %dl
	movdqu	%xmm0, (%rdi)
	movb	%dl, 16(%rdi)
	movb	$0, 17(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit19):
	movdqu	(%rsi), %xmm0
	movw	16(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	movw	%cx, 16(%rdi)
	movb	$0, 18(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit20):
	movdqu	(%rsi), %xmm0
	movl	15(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 15(%rdi)
	movb	$0, 19(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit21):
	movdqu	(%rsi), %xmm0
	movl	16(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 16(%rdi)
	movb	$0, 20(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit22):
	movdqu	(%rsi), %xmm0
	movl	16(%rsi), %ecx
	movb	20(%rsi), %dl
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 16(%rdi)
	movb	%dl, 20(%rdi)
	movb	$0, 21(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit23):
	movdqu	(%rsi), %xmm0
	movq	14(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 14(%rdi)
	movb	$0, 22(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit24):
	movdqu	(%rsi), %xmm0
	movq	15(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 15(%rdi)
	movb	$0, 23(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit25):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 16(%rdi)
	movb	$0, 24(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit26):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rcx
	movb	24(%rsi), %dl
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 16(%rdi)
	movb	%dl, 24(%rdi)
	movb	$0, 25(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit27):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movw	24(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movw	%cx, 24(%rdi)
	movb	$0, 26(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit28):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movl	23(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movl	%ecx, 23(%rdi)
	movb	$0, 27(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit29):
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movl	24(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movl	%ecx, 24(%rdi)
	movb	$0, 28(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit30):
	movdqu	(%rsi), %xmm0
	movdqu	13(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 13(%rdi)
	movb	$0, 29(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit31):
	movdqu	(%rsi), %xmm0
	movdqu	14(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 14(%rdi)
	movb	$0, 30(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit32):
	movdqu	(%rsi), %xmm0
	movdqu	15(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 15(%rdi)
	movb	$0, 31(%rdi)
	jmp	L(CalculateSrcLen)

/*
 * StringTail<N> handlers, reached through ExitStringTailTable with
 * N = %rdx, the offset of the source NUL from %rsi.  StringTail<N>
 * copies N + 1 bytes (the NUL included) from (%rsi) to (%rdi) and
 * returns through RETURN.
 *
 * Lengths that are not a single register width use two moves that
 * overlap in the middle.  %rcx, %rdx, %xmm0 and %xmm2 are scratch.
 */
	.p2align 4
L(StringTail0):
	mov	(%rsi), %dl
	mov	%dl, (%rdi)
	RETURN

	.p2align 4
L(StringTail1):
	mov	(%rsi), %dx
	mov	%dx, (%rdi)
	RETURN

	.p2align 4
L(StringTail2):
	mov	(%rsi), %cx
	mov	2(%rsi), %dl
	mov	%cx, (%rdi)
	mov	%dl, 2(%rdi)
	RETURN

	.p2align 4
L(StringTail3):
	mov	(%rsi), %edx
	mov	%edx, (%rdi)
	RETURN

	.p2align 4
L(StringTail4):
	mov	(%rsi), %ecx
	mov	4(%rsi), %dl
	mov	%ecx, (%rdi)
	mov	%dl, 4(%rdi)
	RETURN

	.p2align 4
L(StringTail5):
	mov	(%rsi), %ecx
	mov	4(%rsi), %dx
	mov	%ecx, (%rdi)
	mov	%dx, 4(%rdi)
	RETURN

	.p2align 4
L(StringTail6):
	mov	(%rsi), %ecx
	mov	3(%rsi), %edx
	mov	%ecx, (%rdi)
	mov	%edx, 3(%rdi)
	RETURN

	.p2align 4
L(StringTail7):
	mov	(%rsi), %rdx
	mov	%rdx, (%rdi)
	RETURN

	.p2align 4
L(StringTail8):
	mov	(%rsi), %rcx
	mov	8(%rsi), %dl
	mov	%rcx, (%rdi)
	mov	%dl, 8(%rdi)
	RETURN

	.p2align 4
L(StringTail9):
	mov	(%rsi), %rcx
	mov	8(%rsi), %dx
	mov	%rcx, (%rdi)
	mov	%dx, 8(%rdi)
	RETURN

	.p2align 4
L(StringTail10):
	mov	(%rsi), %rcx
	mov	7(%rsi), %edx
	mov	%rcx, (%rdi)
	mov	%edx, 7(%rdi)
	RETURN

	.p2align 4
L(StringTail11):
	mov	(%rsi), %rcx
	mov	8(%rsi), %edx
	mov	%rcx, (%rdi)
	mov	%edx, 8(%rdi)
	RETURN

	.p2align 4
L(StringTail12):
	mov	(%rsi), %rcx
	mov	5(%rsi), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, 5(%rdi)
	RETURN

	.p2align 4
L(StringTail13):
	mov	(%rsi), %rcx
	mov	6(%rsi), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, 6(%rdi)
	RETURN

	.p2align 4
L(StringTail14):
	mov	(%rsi), %rcx
	mov	7(%rsi), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, 7(%rdi)
	RETURN

	.p2align 4
L(StringTail15):
	movdqu	(%rsi), %xmm0
	movdqu	%xmm0, (%rdi)
	RETURN

	.p2align 4
L(StringTail16):
	movdqu	(%rsi), %xmm0
	mov	16(%rsi), %cl
	movdqu	%xmm0, (%rdi)
	mov	%cl, 16(%rdi)
	RETURN

	.p2align 4
L(StringTail17):
	movdqu	(%rsi), %xmm0
	mov	16(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	mov	%cx, 16(%rdi)
	RETURN

	.p2align 4
L(StringTail18):
	movdqu	(%rsi), %xmm0
	mov	15(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	mov	%ecx, 15(%rdi)
	RETURN

	.p2align 4
L(StringTail19):
	movdqu	(%rsi), %xmm0
	mov	16(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	mov	%ecx, 16(%rdi)
	RETURN

	.p2align 4
L(StringTail20):
	movdqu	(%rsi), %xmm0
	mov	16(%rsi), %ecx
	mov	20(%rsi), %dl
	movdqu	%xmm0, (%rdi)
	mov	%ecx, 16(%rdi)
	mov	%dl, 20(%rdi)
	RETURN

	.p2align 4
L(StringTail21):
	movdqu	(%rsi), %xmm0
	mov	14(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	mov	%rcx, 14(%rdi)
	RETURN

	.p2align 4
L(StringTail22):
	movdqu	(%rsi), %xmm0
	mov	15(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	mov	%rcx, 15(%rdi)
	RETURN

	.p2align 4
L(StringTail23):
	movdqu	(%rsi), %xmm0
	mov	16(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	mov	%rcx, 16(%rdi)
	RETURN

	.p2align 4
L(StringTail24):
	movdqu	(%rsi), %xmm0
	mov	16(%rsi), %rdx
	mov	24(%rsi), %cl
	movdqu	%xmm0, (%rdi)
	mov	%rdx, 16(%rdi)
	mov	%cl, 24(%rdi)
	RETURN

	.p2align 4
L(StringTail25):
	movdqu	(%rsi), %xmm0
	mov	16(%rsi), %rdx
	mov	24(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	mov	%rdx, 16(%rdi)
	mov	%cx, 24(%rdi)
	RETURN

	.p2align 4
L(StringTail26):
	movdqu	(%rsi), %xmm0
	mov	16(%rsi), %rdx
	mov	23(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	mov	%rdx, 16(%rdi)
	mov	%ecx, 23(%rdi)
	RETURN

	.p2align 4
L(StringTail27):
	movdqu	(%rsi), %xmm0
	mov	16(%rsi), %rdx
	mov	24(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	mov	%rdx, 16(%rdi)
	mov	%ecx, 24(%rdi)
	RETURN

	.p2align 4
L(StringTail28):
	movdqu	(%rsi), %xmm0
	movdqu	13(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 13(%rdi)
	RETURN

	.p2align 4
L(StringTail29):
	movdqu	(%rsi), %xmm0
	movdqu	14(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 14(%rdi)
	RETURN

	.p2align 4
L(StringTail30):
	movdqu	(%rsi), %xmm0
	movdqu	15(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 15(%rdi)
	RETURN

	.p2align 4
L(StringTail31):
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 16(%rdi)
	RETURN

	.p2align 4
L(StringTail32):
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm2
	mov	32(%rsi), %cl
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 16(%rdi)
	mov	%cl, 32(%rdi)
	RETURN

	.p2align 4
L(StringTail33):
	/* NUL at offset 33: 34 bytes are needed, so the last move is a
	   word covering offsets 32-33.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm2
	mov	32(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 16(%rdi)
	mov	%cx, 32(%rdi)
	RETURN

/*
 * CalculateSrcLen: count the source bytes that were not copied, so the
 * result covers the whole source string.
 *   In:  rsi + r8 = first source byte still to be examined
 *        rax      = count so far (already includes r8)
 *        r9       = second partial sum
 *   Out: rax + r9 through RETURN
 * CalculateSrcLenCase1 is the entry used when nothing was copied: it
 * clears r8 and rax so that only r9 and the source length contribute.
 *
 * NOTE(review): the first probe is an unaligned 16-byte load from an
 * arbitrary source address; confirm reading past the terminating NUL is
 * acceptable for all callers.
 */
	.p2align 4
L(CalculateSrcLenCase1):
	xor	%r8d, %r8d
	xor	%eax, %eax
L(CalculateSrcLen):
	pxor	%xmm0, %xmm0
	xor	%ecx, %ecx		/* no alignment correction for the probe */
	add	%r8, %rsi
	movdqu	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %rdx
	test	%rdx, %rdx
	jnz	L(SrcLenLoopEnd)

	add	%rax, %r9		/* bank the count; rax becomes the offset */
	mov	$16, %rax
	mov	%rsi, %rcx
	and	$15, %rcx		/* rcx = misalignment of the probe address */
	and	$-16, %rsi
/* Aligned scan.  xmm0 is still all-zero because no NUL has been seen.  */
L(SrcLenLoop):
	movdqa	(%rsi, %rax), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %rdx
	test	%rdx, %rdx
	jnz	L(SrcLenLoopEnd)
	add	$16, %rax
	jmp	L(SrcLenLoop)

	.p2align 4
L(SrcLenLoopEnd):
	bsf	%rdx, %rdx		/* NUL offset inside the block */
	add	%rdx, %rax
	sub	%rcx, %rax		/* measure from the unaligned probe address */
	RETURN

END (STRLCPY)

/*
 * Dispatch tables used by BRANCH_TO_JMPTBL_ENTRY.  Every entry is the
 * 32-bit offset of a handler from the start of its own table.
 *   ExitTable           - index = destination space left (0..32)
 *   ExitStringTailTable - index = offset of the source NUL (0..33)
 * The alignment directive follows the section switch so that it pads
 * .rodata, where the tables live.
 */
	.section .rodata
	.p2align 4
L(ExitTable):
	.int	JMPTBL(L(Exit0), L(ExitTable))
	.int	JMPTBL(L(Exit1), L(ExitTable))
	.int	JMPTBL(L(Exit2), L(ExitTable))
	.int	JMPTBL(L(Exit3), L(ExitTable))
	.int	JMPTBL(L(Exit4), L(ExitTable))
	.int	JMPTBL(L(Exit5), L(ExitTable))
	.int	JMPTBL(L(Exit6), L(ExitTable))
	.int	JMPTBL(L(Exit7), L(ExitTable))
	.int	JMPTBL(L(Exit8), L(ExitTable))
	.int	JMPTBL(L(Exit9), L(ExitTable))
	.int	JMPTBL(L(Exit10), L(ExitTable))
	.int	JMPTBL(L(Exit11), L(ExitTable))
	.int	JMPTBL(L(Exit12), L(ExitTable))
	.int	JMPTBL(L(Exit13), L(ExitTable))
	.int	JMPTBL(L(Exit14), L(ExitTable))
	.int	JMPTBL(L(Exit15), L(ExitTable))
	.int	JMPTBL(L(Exit16), L(ExitTable))
	.int	JMPTBL(L(Exit17), L(ExitTable))
	.int	JMPTBL(L(Exit18), L(ExitTable))
	.int	JMPTBL(L(Exit19), L(ExitTable))
	.int	JMPTBL(L(Exit20), L(ExitTable))
	.int	JMPTBL(L(Exit21), L(ExitTable))
	.int	JMPTBL(L(Exit22), L(ExitTable))
	.int	JMPTBL(L(Exit23), L(ExitTable))
	.int	JMPTBL(L(Exit24), L(ExitTable))
	.int	JMPTBL(L(Exit25), L(ExitTable))
	.int	JMPTBL(L(Exit26), L(ExitTable))
	.int	JMPTBL(L(Exit27), L(ExitTable))
	.int	JMPTBL(L(Exit28), L(ExitTable))
	.int	JMPTBL(L(Exit29), L(ExitTable))
	.int	JMPTBL(L(Exit30), L(ExitTable))
	.int	JMPTBL(L(Exit31), L(ExitTable))
	.int	JMPTBL(L(Exit32), L(ExitTable))
L(ExitStringTailTable):
	.int	JMPTBL(L(StringTail0), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail1), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail2), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail3), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail4), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail5), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail6), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail7), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail8), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail9), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail10), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail11), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail12), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail13), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail14), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail15), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail16), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail17), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail18), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail19), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail20), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail21), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail22), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail23), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail24), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail25), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail26), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail27), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail28), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail29), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail30), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail31), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail32), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail33), L(ExitStringTailTable))
