1/*
2Copyright (c) 2011, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* L(label) forms a label name by pasting ".L" in front of LABEL; GNU as
   treats ".L"-prefixed symbols as local (non-exported).  */
#ifndef L
# define L(label)	.L##label
#endif

/* The cfi_* macros below are thin wrappers around the GNU as .cfi_*
   call-frame-information directives of the same name.  Each one is guarded
   by #ifndef so that a definition made before this point takes precedence.  */

/* Open a CFI region for a function.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

/* Close the CFI region opened by cfi_startproc.  */
#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

/* Record that REG is saved at offset OFF from the current CFA register.  */
#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

/* Record that REG again holds the value it had at function entry.  */
#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

/* Adjust the recorded CFA offset by OFF bytes.  */
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* Save the current CFI state on the assembler's internal stack.  */
#ifndef cfi_remember_state
# define cfi_remember_state		.cfi_remember_state
#endif

/* Restore the CFI state saved by the matching cfi_remember_state.  */
#ifndef cfi_restore_state
# define cfi_restore_state		.cfi_restore_state
#endif
62
/* ENTRY(name): emit the prologue boilerplate for a global function NAME:
   mark the symbol as a function, export it, align it to 2^4 = 16 bytes,
   place the label and open its CFI region.  Skipped if ENTRY is already
   defined.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function; 	\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END(name): close the CFI region and record the symbol size as the distance
   from NAME to the current location.  Skipped if END is already defined.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
77
/* CFI_PUSH(REG): CFI bookkeeping for a 4-byte push of REG -- the CFA offset
   grows by 4 and REG is recorded as saved at offset 0 from the current CFA
   register.  Emits no machine instruction by itself.  */
#define CFI_PUSH(REG)			\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

/* CFI_POP(REG): CFI bookkeeping for a 4-byte pop of REG -- the CFA offset
   shrinks by 4 and REG is marked as restored.  Emits no machine instruction
   by itself.  */
#define CFI_POP(REG)			\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

/* PUSH/POP: the real pushl/popl instruction followed by the matching CFI
   bookkeeping above.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)
88
/* Name of the exported function; may be overridden before this point.  */
#ifndef STRCAT
# define STRCAT	strcat
#endif

/* Stack displacements of the arguments.  STR1 and STR2 are used as %esp
   displacements after the single PUSH (%edi) at function entry, so they
   expand to 8 and 12: PARMS (4) plus the 4 bytes of the saved %edi.
   STR1 is loaded into %edi (the value later returned in %eax) and STR2
   into %ecx (the bytes being appended).  */
#define PARMS	4
#define STR1	PARMS+4
#define STR2	STR1+4

/* LEN is only read after the additional PUSH (%ebx) in the strncat path,
   hence STR2+8 rather than STR2+4: 4 bytes for the next argument slot plus
   4 bytes for the saved %ebx.  */
#ifdef USE_AS_STRNCAT
# define LEN	STR2+8
#endif

/* Defined before the #include directives further down; presumably the
   included sources test it to adapt themselves to being embedded here
   (their contents are not visible in this file).  */
#define USE_AS_STRCAT
102
	.section .text.ssse3,"ax",@progbits
/* STRCAT entry point (strcat by default; the strncat variant is selected by
   USE_AS_STRNCAT and reads an extra count through LEN).

   Register roles established in this block:
     %edi  first argument (destination start); saved on entry, copied to
           %eax as the return value at every exit
     %ecx  second argument (source)
     %edx  destination write pointer
     %ebx  count from LEN (strncat only); saved before use  */
ENTRY (STRCAT)
	PUSH	(%edi)
	mov	STR1(%esp), %edi
	mov	%edi, %edx

/* RETURN is defined for the benefit of the included strlen source.
   NOTE(review): presumably that source exits through RETURN with the length
   of the string at %edi in %eax, so control lands on L(StrcpyAtom) below --
   the included file is not visible here; confirm against it.  */
#define RETURN	jmp	L(StrcpyAtom)
#include "sse2-strlen-atom.S"

L(StrcpyAtom):
	mov	STR2(%esp), %ecx
	/* %edx = %edi + %eax: where the appended bytes will be written.  */
	lea	(%edi, %eax), %edx
#ifdef USE_AS_STRNCAT
	PUSH	(%ebx)
	mov	LEN(%esp), %ebx
	/* Count of zero: nothing to append.  */
	test	%ebx, %ebx
	jz	L(StrncatExit0)
	/* Counts 1..8 take a path that interleaves NUL and count checks.  */
	cmp	$8, %ebx
	jbe	L(StrncpyExit8Bytes)
#endif
	/* Unrolled probe of source bytes 0..8: a NUL at byte k jumps to
	   L(Exit<k+1>), which copies k+1 bytes including the NUL.  */
	cmpb	$0, (%ecx)
	jz	L(Exit1)
	cmpb	$0, 1(%ecx)
	jz	L(Exit2)
	cmpb	$0, 2(%ecx)
	jz	L(Exit3)
	cmpb	$0, 3(%ecx)
	jz	L(Exit4)
	cmpb	$0, 4(%ecx)
	jz	L(Exit5)
	cmpb	$0, 5(%ecx)
	jz	L(Exit6)
	cmpb	$0, 6(%ecx)
	jz	L(Exit7)
	cmpb	$0, 7(%ecx)
	jz	L(Exit8)
	cmpb	$0, 8(%ecx)
	jz	L(Exit9)
#ifdef USE_AS_STRNCAT
	/* Here the count is above 8 (see the jbe above); counts 9..15 go to
	   the path that checks the count between the remaining probes.  */
	cmp	$16, %ebx
	jb	L(StrncpyExit15Bytes)
#endif
	/* Unrolled probe of source bytes 9..15.  */
	cmpb	$0, 9(%ecx)
	jz	L(Exit10)
	cmpb	$0, 10(%ecx)
	jz	L(Exit11)
	cmpb	$0, 11(%ecx)
	jz	L(Exit12)
	cmpb	$0, 12(%ecx)
	jz	L(Exit13)
	cmpb	$0, 13(%ecx)
	jz	L(Exit14)
	cmpb	$0, 14(%ecx)
	jz	L(Exit15)
	cmpb	$0, 15(%ecx)
	jz	L(Exit16)
#ifdef USE_AS_STRNCAT
	/* No NUL in the first 16 source bytes.  A count of exactly 16 means
	   copy those 16 bytes and append the terminator.  */
	cmp	$16, %ebx
	je	L(StrncatExit16)

/* RETURN1: function epilogue used at every exit.  It restores the saved
   registers and returns; the CFI_PUSH directives after `ret' emit no code,
   they only re-establish the unwind state for the instructions that follow
   the expansion (which are reached by jumps with the registers still on the
   stack).

   The CFI_PUSH order must mirror the real push order.  %edi is pushed first
   at function entry and %ebx second, so %edi lives in the slot nearer the
   return address.  Each CFI_PUSH records its register at offset 0 from the
   stack pointer *after* bumping the CFA offset by 4, so the register pushed
   first must be described first.  Describing %ebx before %edi would record
   the two save slots swapped and make unwinders/debuggers restore the wrong
   values.  */
# define RETURN1	POP (%ebx); POP (%edi);	ret; \
	CFI_PUSH (%edi); CFI_PUSH (%ebx)
/* NOTE(review): presumably selects the length-limited code paths in the
   strcpy source included below -- that file is not visible here.  */
# define USE_AS_STRNCPY
#else
/* Plain strcat saves only %edi.  */
# define RETURN1	POP(%edi); ret; CFI_PUSH(%edi)
#endif
#include "ssse3-strcpy-atom.S"
170
	/* L(CopyFrom1To16Bytes): tail dispatch used when a terminator lies
	   within the next 16 source bytes.  On entry %esi is an offset that is
	   added to both the destination (%edx) and source (%ecx) pointers, a
	   saved %esi is on top of the stack, and %ax is tested bit by bit:
	   bit k set selects L(Exit<k+1>).
	   NOTE(review): %eax is presumably a per-byte NUL mask produced by
	   the included copy loop (not visible in this file) -- confirm.  */
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)
	/* Low byte of the mask empty: the hit is in bytes 8..15.  */
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit1)
	test	$0x02, %al
	jnz	L(Exit2)
	test	$0x04, %al
	jnz	L(Exit3)
	test	$0x08, %al
	jnz	L(Exit4)
	test	$0x10, %al
	jnz	L(Exit5)
	test	$0x20, %al
	jnz	L(Exit6)
	test	$0x40, %al
	jnz	L(Exit7)
	/* %al is non-zero but bits 0..6 are clear, so bit 7 is set:
	   copy 8 bytes and return the destination start.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN1

	/* L(ExitHigh): same dispatch on %ah, for bytes 8..15.  */
	.p2align 4
L(ExitHigh):
	test	$0x01, %ah
	jnz	L(Exit9)
	test	$0x02, %ah
	jnz	L(Exit10)
	test	$0x04, %ah
	jnz	L(Exit11)
	test	$0x08, %ah
	jnz	L(Exit12)
	test	$0x10, %ah
	jnz	L(Exit13)
	test	$0x20, %ah
	jnz	L(Exit14)
	test	$0x40, %ah
	jnz	L(Exit15)
	/* None of bits 0..6 of %ah set: copy all 16 bytes as two 8-byte
	   moves and return the destination start.  */
	movlpd	(%ecx), %xmm0
	movlpd	8(%ecx), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)
	movl	%edi, %eax
	RETURN1
220
	/* Fixed-size tails for 1..8 bytes.

	   L(Exit<N>) copies exactly N bytes from (%ecx) to (%edx), then
	   returns the destination start (%edi) in %eax via RETURN1.

	   L(StrncatExit<N>) first stores %bh at offset N of the destination
	   and then falls through into L(Exit<N>).  Every jump to these labels
	   visible in this file is taken when %ebx equals N (at most 16), so
	   %bh is zero there and the store writes the terminating NUL.  */
	.p2align 4
L(StrncatExit1):
	movb	%bh, 1(%edx)
L(Exit1):
	movb	(%ecx), %al
	movb	%al, (%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit2):
	movb	%bh, 2(%edx)
L(Exit2):
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit3):
	movb	%bh, 3(%edx)
L(Exit3):
	/* 2 bytes + 1 byte.  */
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit4):
	movb	%bh, 4(%edx)
L(Exit4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit5):
	movb	%bh, 5(%edx)
L(Exit5):
	/* 4 bytes + 1 byte.  */
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit6):
	movb	%bh, 6(%edx)
L(Exit6):
	/* 4 bytes + 2 bytes.  */
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit7):
	movb	%bh, 7(%edx)
L(Exit7):
	/* Two 4-byte moves at offsets 0 and 3; they overlap on byte 3 and
	   together cover bytes 0..6.  */
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax
	movl	%eax, 3(%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit8):
	movb	%bh, 8(%edx)
L(Exit8):
	/* One 8-byte move through %xmm0.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN1
300
	/* Fixed-size tails for 9..16 bytes.

	   L(Exit<N>) copies exactly N bytes from (%ecx) to (%edx) -- an 8-byte
	   move for bytes 0..7 plus one more move for the rest -- then returns
	   the destination start (%edi) in %eax via RETURN1.

	   L(StrncatExit<N>) first stores %bh at offset N of the destination
	   and then falls through into L(Exit<N>).  Every jump to these labels
	   visible in this file is taken when %ebx equals N (at most 16), so
	   %bh is zero there and the store writes the terminating NUL.  */
	.p2align 4
L(StrncatExit9):
	movb	%bh, 9(%edx)
L(Exit9):
	/* 8 bytes + 1 byte.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit10):
	movb	%bh, 10(%edx)
L(Exit10):
	/* 8 bytes + 2 bytes.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit11):
	movb	%bh, 11(%edx)
L(Exit11):
	/* 8 bytes + 4 bytes at offset 7 (overlaps byte 7, covers 7..10).  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax
	movl	%eax, 7(%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit12):
	movb	%bh, 12(%edx)
L(Exit12):
	/* 8 bytes + 4 bytes.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit13):
	movb	%bh, 13(%edx)
L(Exit13):
	/* Two overlapping 8-byte moves: bytes 0..7 and 5..12.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0
	movlpd	%xmm0, 5(%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit14):
	movb	%bh, 14(%edx)
L(Exit14):
	/* Two overlapping 8-byte moves: bytes 0..7 and 6..13.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0
	movlpd	%xmm0, 6(%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit15):
	movb	%bh, 15(%edx)
L(Exit15):
	/* Two overlapping 8-byte moves: bytes 0..7 and 7..14.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0
	movlpd	%xmm0, 7(%edx)
	movl	%edi, %eax
	RETURN1

	.p2align 4
L(StrncatExit16):
	movb	%bh, 16(%edx)
L(Exit16):
	/* Two adjacent 8-byte moves: bytes 0..7 and 8..15.  */
	movlpd	(%ecx), %xmm0
	movlpd	8(%ecx), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)
	movl	%edi, %eax
	RETURN1
388
389#ifdef USE_AS_STRNCPY
390
	/* The code below runs with a saved %esi on top of the stack (it is
	   popped at the POP (%esi) further down), but the CFI state left by
	   the preceding RETURN1 does not include it; describe it again.
	   NOTE(review): the matching push is presumably in the included
	   strcpy source -- not visible in this file.  */
	CFI_PUSH(%esi)

	/* L(CopyFrom1To16BytesCase2): tail dispatch where both the mask in
	   %ax and the remaining count in %ebx can end the copy.  %ebx is
	   first increased by 16 and %esi is added to both pointers.
	   NOTE(review): %eax is presumably a per-byte NUL mask and %ebx the
	   count biased by -16 by the included copy loop -- confirm there.  */
	.p2align 4
L(CopyFrom1To16BytesCase2):
	add	$16, %ebx
	add	%esi, %ecx
	/* %edx is about to be used as scratch, so park the adjusted
	   destination pointer (%esi + %edx) in %esi for now.  */
	lea	(%esi, %edx), %esi
	/* %dh bit 7 = bit 15 of (%ebx - 9); for the small counts handled
	   here it is set exactly when %ebx < 9.  OR-ing in %al then leaves
	   ZF set only if %al == 0 and that bit is clear, i.e. neither a
	   mask hit nor the count limit falls within bytes 0..7.  */
	lea	-9(%ebx), %edx
	and	$1<<7, %dh
	or	%al, %dh
	/* lea and pop do not modify flags, so ZF from the `or' above is
	   still valid at the jz.  */
	lea	(%esi), %edx
	POP	(%esi)
	jz	L(ExitHighCase2)

	/* Bytes 0..7: for each k, a mask hit at bit k-1 wins (L(Exit<k>)),
	   otherwise a count of exactly k ends the copy with an appended
	   terminator (L(StrncatExit<k>)).  */
	test	$0x01, %al
	jnz	L(Exit1)
	cmp	$1, %ebx
	je	L(StrncatExit1)
	test	$0x02, %al
	jnz	L(Exit2)
	cmp	$2, %ebx
	je	L(StrncatExit2)
	test	$0x04, %al
	jnz	L(Exit3)
	cmp	$3, %ebx
	je	L(StrncatExit3)
	test	$0x08, %al
	jnz	L(Exit4)
	cmp	$4, %ebx
	je	L(StrncatExit4)
	test	$0x10, %al
	jnz	L(Exit5)
	cmp	$5, %ebx
	je	L(StrncatExit5)
	test	$0x20, %al
	jnz	L(Exit6)
	cmp	$6, %ebx
	je	L(StrncatExit6)
	test	$0x40, %al
	jnz	L(Exit7)
	cmp	$7, %ebx
	je	L(StrncatExit7)
	/* Copy 8 bytes, then place the terminator: cmpb $1 sets CF exactly
	   when byte 7 is 0, and sbb $-1 computes %eax + 1 - CF.  So %eax
	   stays at byte 7 if it is already NUL, otherwise advances to byte
	   8, and a zero byte is stored there.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	lea	7(%edx), %eax
	cmpb	$1, (%eax)
	sbb	$-1, %eax
	xor	%cl, %cl
	movb	%cl, (%eax)
	movl	%edi, %eax
	RETURN1

	/* L(ExitHighCase2): same interleaved mask/count dispatch for bytes
	   8..15, using %ah.  */
	.p2align 4
L(ExitHighCase2):
	test	$0x01, %ah
	jnz	L(Exit9)
	cmp	$9, %ebx
	je	L(StrncatExit9)
	test	$0x02, %ah
	jnz	L(Exit10)
	cmp	$10, %ebx
	je	L(StrncatExit10)
	test	$0x04, %ah
	jnz	L(Exit11)
	cmp	$11, %ebx
	je	L(StrncatExit11)
	test	$0x8, %ah
	jnz	L(Exit12)
	cmp	$12, %ebx
	je	L(StrncatExit12)
	test	$0x10, %ah
	jnz	L(Exit13)
	cmp	$13, %ebx
	je	L(StrncatExit13)
	test	$0x20, %ah
	jnz	L(Exit14)
	cmp	$14, %ebx
	je	L(StrncatExit14)
	test	$0x40, %ah
	jnz	L(Exit15)
	cmp	$15, %ebx
	je	L(StrncatExit15)
	/* Fall-through: copy all 16 bytes.  No separate terminator is
	   stored on this path.
	   NOTE(review): presumably byte 15 is the NUL here (mask bit 15
	   set) -- confirm against the callers in the included file.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	8(%ecx), %xmm1
	movlpd	%xmm1, 8(%edx)
	movl	%edi, %eax
	RETURN1
479
	/* As above: the following code runs with a saved %esi on the stack
	   (popped at the POP (%esi) below), which the CFI state left by the
	   preceding RETURN1 does not describe.  */
	CFI_PUSH(%esi)

	/* L(CopyFrom1To16BytesCase2OrCase3): a non-zero %eax selects Case2
	   (mask and count both relevant); a zero %eax falls through to Case3,
	   where only the count in %ebx decides how much to copy.  */
L(CopyFrom1To16BytesCase2OrCase3):
	test	%eax, %eax
	jnz	L(CopyFrom1To16BytesCase2)

	/* L(CopyFrom1To16BytesCase3): %ebx is increased by 16, %esi is added
	   to both pointers, and the resulting count picks the tail.  */
	.p2align 4
L(CopyFrom1To16BytesCase3):
	add	$16, %ebx
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)

	/* Counts above 8 are handled by L(ExitHighCase3).  */
	cmp	$8, %ebx
	ja	L(ExitHighCase3)
	cmp	$1, %ebx
	je	L(StrncatExit1)
	cmp	$2, %ebx
	je	L(StrncatExit2)
	cmp	$3, %ebx
	je	L(StrncatExit3)
	cmp	$4, %ebx
	je	L(StrncatExit4)
	cmp	$5, %ebx
	je	L(StrncatExit5)
	cmp	$6, %ebx
	je	L(StrncatExit6)
	cmp	$7, %ebx
	je	L(StrncatExit7)
	/* Fall-through: copy 8 bytes and store %bh at offset 8.  %ebx is
	   at most 8 here, so %bh is zero and this writes the terminator.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	%bh, 8(%edx)
	movl	%edi, %eax
	RETURN1

	/* L(ExitHighCase3): count-only dispatch for counts above 8.  */
	.p2align 4
L(ExitHighCase3):
	cmp	$9, %ebx
	je	L(StrncatExit9)
	cmp	$10, %ebx
	je	L(StrncatExit10)
	cmp	$11, %ebx
	je	L(StrncatExit11)
	cmp	$12, %ebx
	je	L(StrncatExit12)
	cmp	$13, %ebx
	je	L(StrncatExit13)
	cmp	$14, %ebx
	je	L(StrncatExit14)
	cmp	$15, %ebx
	je	L(StrncatExit15)
	/* Fall-through: copy 16 bytes and store %bh at offset 16.
	   NOTE(review): this writes a NUL only if %ebx < 256 here;
	   presumably %ebx is 16 on this path -- confirm against callers.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	8(%ecx), %xmm1
	movlpd	%xmm1, 8(%edx)
	movb	%bh, 16(%edx)
	movl	%edi, %eax
	RETURN1
539
	/* L(StrncatExit0): the count was zero; nothing is written and the
	   destination start is returned.  */
	.p2align 4
L(StrncatExit0):
	movl	%edi, %eax
	RETURN1

	/* L(StrncpyExit15Bytes): reached from the entry path when the count
	   in %ebx is 9..15 and source bytes 0..8 are all non-zero.  Count
	   checks and NUL probes alternate: a count of k ends the copy with an
	   appended terminator (L(StrncatExit<k>)); a NUL at byte k ends it at
	   L(Exit<k+1>).  */
	.p2align 4
L(StrncpyExit15Bytes):
	cmp	$9, %ebx
	je	L(StrncatExit9)
	cmpb	$0, 9(%ecx)
	jz	L(Exit10)
	cmp	$10, %ebx
	je	L(StrncatExit10)
	cmpb	$0, 10(%ecx)
	jz	L(Exit11)
	cmp	$11, %ebx
	je	L(StrncatExit11)
	cmpb	$0, 11(%ecx)
	jz	L(Exit12)
	cmp	$12, %ebx
	je	L(StrncatExit12)
	cmpb	$0, 12(%ecx)
	jz	L(Exit13)
	cmp	$13, %ebx
	je	L(StrncatExit13)
	cmpb	$0, 13(%ecx)
	jz	L(Exit14)
	cmp	$14, %ebx
	je	L(StrncatExit14)
	/* Count is 15: copy bytes 0..14 with two overlapping 8-byte moves
	   (0..7 and 7..14), then place the terminator.  cmpb $1 sets CF
	   exactly when byte 14 is 0 and sbb $-1 computes %eax + 1 - CF, so
	   the zero in %bh goes to offset 14 if that byte is already NUL and
	   to offset 15 otherwise.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0
	movlpd	%xmm0, 7(%edx)
	lea	14(%edx), %eax
	cmpb	$1, (%eax)
	sbb	$-1, %eax
	movb	%bh, (%eax)
	movl	%edi, %eax
	RETURN1

	/* L(StrncpyExit8Bytes): reached from the entry path when the count
	   in %ebx is 1..8.  NUL probes and count checks alternate: a NUL at
	   byte k ends the copy at L(Exit<k+1>); a count of k+1 with no NUL so
	   far ends it at L(StrncatExit<k+1>).  */
	.p2align 4
L(StrncpyExit8Bytes):
	cmpb	$0, (%ecx)
	jz	L(Exit1)
	cmp	$1, %ebx
	je	L(StrncatExit1)
	cmpb	$0, 1(%ecx)
	jz	L(Exit2)
	cmp	$2, %ebx
	je	L(StrncatExit2)
	cmpb	$0, 2(%ecx)
	jz	L(Exit3)
	cmp	$3, %ebx
	je	L(StrncatExit3)
	cmpb	$0, 3(%ecx)
	jz	L(Exit4)
	cmp	$4, %ebx
	je	L(StrncatExit4)
	cmpb	$0, 4(%ecx)
	jz	L(Exit5)
	cmp	$5, %ebx
	je	L(StrncatExit5)
	cmpb	$0, 5(%ecx)
	jz	L(Exit6)
	cmp	$6, %ebx
	je	L(StrncatExit6)
	cmpb	$0, 6(%ecx)
	jz	L(Exit7)
	cmp	$7, %ebx
	je	L(StrncatExit7)
	/* Count is 8: copy bytes 0..7, then place the terminator with the
	   same cmpb/sbb idiom -- offset 7 if that byte is already NUL,
	   offset 8 otherwise.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	lea	7(%edx), %eax
	cmpb	$1, (%eax)
	sbb	$-1, %eax
	movb	%bh, (%eax)
	movl	%edi, %eax
	RETURN1
618
619#endif
/* Close the CFI region opened by ENTRY (STRCAT) and set the symbol size.  */
END (STRCAT)
621