1/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#define FOR_ATOM
32#include "cache.h"
33
34#ifndef MEMCPY
35# define MEMCPY	memcpy_atom
36#endif
37
38#ifndef L
39# define L(label)	.L##label
40#endif
41
42#ifndef cfi_startproc
43# define cfi_startproc	.cfi_startproc
44#endif
45
46#ifndef cfi_endproc
47# define cfi_endproc	.cfi_endproc
48#endif
49
50#ifndef cfi_rel_offset
51# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
52#endif
53
54#ifndef cfi_restore
55# define cfi_restore(reg)	.cfi_restore reg
56#endif
57
58#ifndef cfi_adjust_cfa_offset
59# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
60#endif
61
62#ifndef ENTRY
63# define ENTRY(name)		\
64	.type name,  @function;		\
65	.globl name;		\
66	.p2align 4;		\
67name:		\
68	cfi_startproc
69#endif
70
71#ifndef END
72# define END(name)		\
73	cfi_endproc;		\
74	.size name, .-name
75#endif
76
77#define DEST		PARMS
78#define SRC		DEST+4
79#define LEN		SRC+4
80
81#define CFI_PUSH(REG)		\
82  cfi_adjust_cfa_offset (4);		\
83  cfi_rel_offset (REG, 0)
84
85#define CFI_POP(REG)		\
86  cfi_adjust_cfa_offset (-4);		\
87  cfi_restore (REG)
88
89#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
90#define POP(REG)	popl REG; CFI_POP (REG)
91
92#if (defined SHARED || defined __PIC__)
93# define PARMS		8		/* Preserve EBX.  */
94# define ENTRANCE	PUSH (%ebx);
95# define RETURN_END	POP (%ebx); ret
96# define RETURN		RETURN_END; CFI_PUSH (%ebx)
97# define JMPTBL(I, B)	I - B
98
99# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
100
101/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
102	jump table with relative offsets.  INDEX is a register contains the
103	index into the jump table.   SCALE is the scale of INDEX. */
104
105# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
106    /* We first load PC into EBX.  */		\
107	SETUP_PIC_REG(bx);		\
108    /* Get the address of the jump table.  */		\
109	addl	$(TABLE - .), %ebx;		\
110    /* Get the entry and convert the relative offset to the		\
111	absolute	address.  */		\
112	addl	(%ebx, INDEX, SCALE), %ebx;		\
113    /* We loaded the jump table.  Go.  */		\
114	jmp	*%ebx
115#else
116
117# define PARMS		4
118# define ENTRANCE
119# define RETURN_END	ret
120# define RETURN		RETURN_END
121# define JMPTBL(I, B)	I
122
123/* Branch to an entry in a jump table.  TABLE is a jump table with
124	absolute offsets.  INDEX is a register contains the index into the
125	jump table.  SCALE is the scale of INDEX. */
126
127# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
128	jmp	*TABLE(, INDEX, SCALE)
129#endif
130
131	.section .text.ssse3,"ax",@progbits
132ENTRY (MEMCPY)
133	ENTRANCE
134	movl	LEN(%esp), %ecx
135	movl	SRC(%esp), %eax
136	movl	DEST(%esp), %edx
137
138#ifdef USE_AS_MEMMOVE
139	cmp	%eax, %edx
140	jb	L(copy_forward)
141	je	L(fwd_write_0bytes)
142	cmp	$32, %ecx
143	jae	L(memmove_bwd)
144	jmp	L(bk_write_less32bytes_2)
145
146	.p2align 4
147L(memmove_bwd):
148	add	%ecx, %eax
149	cmp	%eax, %edx
150	movl	SRC(%esp), %eax
151	jb	L(copy_backward)
152
153L(copy_forward):
154#endif
155	cmp	$48, %ecx
156	jae	L(48bytesormore)
157
158L(fwd_write_less32bytes):
159#ifndef USE_AS_MEMMOVE
160	cmp	%dl, %al
161	jb	L(bk_write)
162#endif
163	add	%ecx, %edx
164	add	%ecx, %eax
165	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
166#ifndef USE_AS_MEMMOVE
167	.p2align 4
168L(bk_write):
169	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
170#endif
171
172	.p2align 4
173L(48bytesormore):
174#ifndef USE_AS_MEMMOVE
175	movlpd	(%eax), %xmm0
176	movlpd	8(%eax), %xmm1
177	movlpd	%xmm0, (%edx)
178	movlpd	%xmm1, 8(%edx)
179#else
180	movdqu	(%eax), %xmm0
181#endif
182	PUSH (%edi)
183	movl	%edx, %edi
184	and	$-16, %edx
185	add	$16, %edx
186	sub	%edx, %edi
187	add	%edi, %ecx
188	sub	%edi, %eax
189
190#ifdef SHARED_CACHE_SIZE_HALF
191	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
192#else
193# if (defined SHARED || defined __PIC__)
194	SETUP_PIC_REG(bx)
195	add	$_GLOBAL_OFFSET_TABLE_, %ebx
196	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
197# else
198	cmp	__x86_shared_cache_size_half, %ecx
199# endif
200#endif
201
202	mov	%eax, %edi
203	jae	L(large_page)
204	and	$0xf, %edi
205	jz	L(shl_0)
206	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
207
208	.p2align 4
209L(shl_0):
210#ifdef USE_AS_MEMMOVE
211	movl	DEST+4(%esp), %edi
212	movdqu	%xmm0, (%edi)
213#endif
214	xor	%edi, %edi
215	cmp	$127, %ecx
216	ja	L(shl_0_gobble)
217	lea	-32(%ecx), %ecx
218
219	.p2align 4
220L(shl_0_loop):
221	movdqa	(%eax, %edi), %xmm0
222	movdqa	16(%eax, %edi), %xmm1
223	sub	$32, %ecx
224	movdqa	%xmm0, (%edx, %edi)
225	movdqa	%xmm1, 16(%edx, %edi)
226	lea	32(%edi), %edi
227	jb	L(shl_0_end)
228
229	movdqa	(%eax, %edi), %xmm0
230	movdqa	16(%eax, %edi), %xmm1
231	sub	$32, %ecx
232	movdqa	%xmm0, (%edx, %edi)
233	movdqa	%xmm1, 16(%edx, %edi)
234	lea	32(%edi), %edi
235	jb	L(shl_0_end)
236
237	movdqa	(%eax, %edi), %xmm0
238	movdqa	16(%eax, %edi), %xmm1
239	sub	$32, %ecx
240	movdqa	%xmm0, (%edx, %edi)
241	movdqa	%xmm1, 16(%edx, %edi)
242	lea	32(%edi), %edi
243	jb	L(shl_0_end)
244
245	movdqa	(%eax, %edi), %xmm0
246	movdqa	16(%eax, %edi), %xmm1
247	sub	$32, %ecx
248	movdqa	%xmm0, (%edx, %edi)
249	movdqa	%xmm1, 16(%edx, %edi)
250	lea	32(%edi), %edi
251
252L(shl_0_end):
253	lea	32(%ecx), %ecx
254	add	%ecx, %edi
255	add	%edi, %edx
256	add	%edi, %eax
257	POP (%edi)
258	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
259
260	CFI_PUSH (%edi)
261
262	.p2align 4
263L(shl_0_gobble):
264#ifdef DATA_CACHE_SIZE_HALF
265	cmp	$DATA_CACHE_SIZE_HALF, %ecx
266#else
267# if (defined SHARED || defined __PIC__)
268	SETUP_PIC_REG(bx)
269	add	$_GLOBAL_OFFSET_TABLE_, %ebx
270	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
271# else
272	cmp	__x86_data_cache_size_half, %ecx
273# endif
274#endif
275	POP	(%edi)
276	lea	-128(%ecx), %ecx
277	jae	L(shl_0_gobble_mem_loop)
278
279	.p2align 4
280L(shl_0_gobble_cache_loop):
281	movdqa	(%eax), %xmm0
282	movdqa	0x10(%eax), %xmm1
283	movdqa	0x20(%eax), %xmm2
284	movdqa	0x30(%eax), %xmm3
285	movdqa	0x40(%eax), %xmm4
286	movdqa	0x50(%eax), %xmm5
287	movdqa	0x60(%eax), %xmm6
288	movdqa	0x70(%eax), %xmm7
289	lea	0x80(%eax), %eax
290	sub	$128, %ecx
291	movdqa	%xmm0, (%edx)
292	movdqa	%xmm1, 0x10(%edx)
293	movdqa	%xmm2, 0x20(%edx)
294	movdqa	%xmm3, 0x30(%edx)
295	movdqa	%xmm4, 0x40(%edx)
296	movdqa	%xmm5, 0x50(%edx)
297	movdqa	%xmm6, 0x60(%edx)
298	movdqa	%xmm7, 0x70(%edx)
299	lea	0x80(%edx), %edx
300
301	jae	L(shl_0_gobble_cache_loop)
302	cmp	$-0x40, %ecx
303	lea	0x80(%ecx), %ecx
304	jl	L(shl_0_cache_less_64bytes)
305
306	movdqa	(%eax), %xmm0
307	sub	$0x40, %ecx
308	movdqa	0x10(%eax), %xmm1
309	movdqa	%xmm0, (%edx)
310	movdqa	%xmm1, 0x10(%edx)
311	movdqa	0x20(%eax), %xmm0
312	movdqa	0x30(%eax), %xmm1
313	add	$0x40, %eax
314	movdqa	%xmm0, 0x20(%edx)
315	movdqa	%xmm1, 0x30(%edx)
316	add	$0x40, %edx
317
318L(shl_0_cache_less_64bytes):
319	cmp	$0x20, %ecx
320	jb	L(shl_0_cache_less_32bytes)
321	movdqa	(%eax), %xmm0
322	sub	$0x20, %ecx
323	movdqa	0x10(%eax), %xmm1
324	add	$0x20, %eax
325	movdqa	%xmm0, (%edx)
326	movdqa	%xmm1, 0x10(%edx)
327	add	$0x20, %edx
328
329L(shl_0_cache_less_32bytes):
330	cmp	$0x10, %ecx
331	jb	L(shl_0_cache_less_16bytes)
332	sub	$0x10, %ecx
333	movdqa	(%eax), %xmm0
334	add	$0x10, %eax
335	movdqa	%xmm0, (%edx)
336	add	$0x10, %edx
337
338L(shl_0_cache_less_16bytes):
339	add	%ecx, %edx
340	add	%ecx, %eax
341	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
342
343	.p2align 4
344L(shl_0_gobble_mem_loop):
345	prefetcht0 0x1c0(%eax)
346	prefetcht0 0x280(%eax)
347	prefetcht0 0x1c0(%edx)
348
349	movdqa	(%eax), %xmm0
350	movdqa	0x10(%eax), %xmm1
351	movdqa	0x20(%eax), %xmm2
352	movdqa	0x30(%eax), %xmm3
353	movdqa	0x40(%eax), %xmm4
354	movdqa	0x50(%eax), %xmm5
355	movdqa	0x60(%eax), %xmm6
356	movdqa	0x70(%eax), %xmm7
357	lea	0x80(%eax), %eax
358	sub	$0x80, %ecx
359	movdqa	%xmm0, (%edx)
360	movdqa	%xmm1, 0x10(%edx)
361	movdqa	%xmm2, 0x20(%edx)
362	movdqa	%xmm3, 0x30(%edx)
363	movdqa	%xmm4, 0x40(%edx)
364	movdqa	%xmm5, 0x50(%edx)
365	movdqa	%xmm6, 0x60(%edx)
366	movdqa	%xmm7, 0x70(%edx)
367	lea	0x80(%edx), %edx
368
369	jae	L(shl_0_gobble_mem_loop)
370	cmp	$-0x40, %ecx
371	lea	0x80(%ecx), %ecx
372	jl	L(shl_0_mem_less_64bytes)
373
374	movdqa	(%eax), %xmm0
375	sub	$0x40, %ecx
376	movdqa	0x10(%eax), %xmm1
377
378	movdqa	%xmm0, (%edx)
379	movdqa	%xmm1, 0x10(%edx)
380
381	movdqa	0x20(%eax), %xmm0
382	movdqa	0x30(%eax), %xmm1
383	add	$0x40, %eax
384
385	movdqa	%xmm0, 0x20(%edx)
386	movdqa	%xmm1, 0x30(%edx)
387	add	$0x40, %edx
388
389L(shl_0_mem_less_64bytes):
390	cmp	$0x20, %ecx
391	jb	L(shl_0_mem_less_32bytes)
392	movdqa	(%eax), %xmm0
393	sub	$0x20, %ecx
394	movdqa	0x10(%eax), %xmm1
395	add	$0x20, %eax
396	movdqa	%xmm0, (%edx)
397	movdqa	%xmm1, 0x10(%edx)
398	add	$0x20, %edx
399
400L(shl_0_mem_less_32bytes):
401	cmp	$0x10, %ecx
402	jb	L(shl_0_mem_less_16bytes)
403	sub	$0x10, %ecx
404	movdqa	(%eax), %xmm0
405	add	$0x10, %eax
406	movdqa	%xmm0, (%edx)
407	add	$0x10, %edx
408
409L(shl_0_mem_less_16bytes):
410	add	%ecx, %edx
411	add	%ecx, %eax
412	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
413
414	.p2align 4
415L(shl_1):
416#ifndef USE_AS_MEMMOVE
417	movaps	-1(%eax), %xmm1
418#else
419	movl	DEST+4(%esp), %edi
420	movaps	-1(%eax), %xmm1
421	movdqu	%xmm0, (%edi)
422#endif
423#ifdef DATA_CACHE_SIZE_HALF
424	cmp	$DATA_CACHE_SIZE_HALF, %ecx
425#else
426# if (defined SHARED || defined __PIC__)
427	SETUP_PIC_REG(bx)
428	add	$_GLOBAL_OFFSET_TABLE_, %ebx
429	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
430# else
431	cmp	__x86_data_cache_size_half, %ecx
432# endif
433#endif
434	jb L(sh_1_no_prefetch)
435
436	lea	-64(%ecx), %ecx
437
438	.p2align 4
439L(Shl1LoopStart):
440	prefetcht0 0x1c0(%eax)
441	prefetcht0 0x1c0(%edx)
442	movaps	15(%eax), %xmm2
443	movaps	31(%eax), %xmm3
444	movaps	47(%eax), %xmm4
445	movaps	63(%eax), %xmm5
446	movaps	%xmm5, %xmm7
447	palignr	$1, %xmm4, %xmm5
448	palignr	$1, %xmm3, %xmm4
449	movaps	%xmm5, 48(%edx)
450	palignr	$1, %xmm2, %xmm3
451	lea	64(%eax), %eax
452	palignr	$1, %xmm1, %xmm2
453	movaps	%xmm4, 32(%edx)
454	movaps	%xmm3, 16(%edx)
455	movaps	%xmm7, %xmm1
456	movaps	%xmm2, (%edx)
457	lea	64(%edx), %edx
458	sub	$64, %ecx
459	ja	L(Shl1LoopStart)
460
461L(Shl1LoopLeave):
462	add	$32, %ecx
463	jle	L(shl_end_0)
464
465	movaps	15(%eax), %xmm2
466	movaps	31(%eax), %xmm3
467	palignr	$1, %xmm2, %xmm3
468	palignr	$1, %xmm1, %xmm2
469	movaps	%xmm2, (%edx)
470	movaps	%xmm3, 16(%edx)
471	lea	32(%edx, %ecx), %edx
472	lea	32(%eax, %ecx), %eax
473	POP (%edi)
474	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
475
476	CFI_PUSH (%edi)
477
478	.p2align 4
479L(sh_1_no_prefetch):
480	lea	-32(%ecx), %ecx
481	lea	-1(%eax), %eax
482	xor	%edi, %edi
483
484	.p2align 4
485L(sh_1_no_prefetch_loop):
486	movdqa	16(%eax, %edi), %xmm2
487	sub	$32, %ecx
488	movdqa	32(%eax, %edi), %xmm3
489	movdqa	%xmm3, %xmm4
490	palignr	$1, %xmm2, %xmm3
491	palignr	$1, %xmm1, %xmm2
492	lea	32(%edi), %edi
493	movdqa	%xmm2, -32(%edx, %edi)
494	movdqa	%xmm3, -16(%edx, %edi)
495	jb	L(sh_1_end_no_prefetch_loop)
496
497	movdqa	16(%eax, %edi), %xmm2
498	sub	$32, %ecx
499	movdqa	32(%eax, %edi), %xmm3
500	movdqa	%xmm3, %xmm1
501	palignr	$1, %xmm2, %xmm3
502	palignr	$1, %xmm4, %xmm2
503	lea	32(%edi), %edi
504	movdqa	%xmm2, -32(%edx, %edi)
505	movdqa	%xmm3, -16(%edx, %edi)
506	jae	L(sh_1_no_prefetch_loop)
507
508L(sh_1_end_no_prefetch_loop):
509	lea	32(%ecx), %ecx
510	add	%ecx, %edi
511	add	%edi, %edx
512	lea	1(%edi, %eax), %eax
513	POP	(%edi)
514	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
515
516	CFI_PUSH (%edi)
517
518	.p2align 4
519L(shl_2):
520#ifndef USE_AS_MEMMOVE
521	movaps	-2(%eax), %xmm1
522#else
523	movl	DEST+4(%esp), %edi
524	movaps	-2(%eax), %xmm1
525	movdqu	%xmm0, (%edi)
526#endif
527#ifdef DATA_CACHE_SIZE_HALF
528	cmp	$DATA_CACHE_SIZE_HALF, %ecx
529#else
530# if (defined SHARED || defined __PIC__)
531	SETUP_PIC_REG(bx)
532	add	$_GLOBAL_OFFSET_TABLE_, %ebx
533	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
534# else
535	cmp	__x86_data_cache_size_half, %ecx
536# endif
537#endif
538	jb L(sh_2_no_prefetch)
539
540	lea	-64(%ecx), %ecx
541
542	.p2align 4
543L(Shl2LoopStart):
544	prefetcht0 0x1c0(%eax)
545	prefetcht0 0x1c0(%edx)
546	movaps	14(%eax), %xmm2
547	movaps	30(%eax), %xmm3
548	movaps	46(%eax), %xmm4
549	movaps	62(%eax), %xmm5
550	movaps	%xmm5, %xmm7
551	palignr	$2, %xmm4, %xmm5
552	palignr	$2, %xmm3, %xmm4
553	movaps	%xmm5, 48(%edx)
554	palignr	$2, %xmm2, %xmm3
555	lea	64(%eax), %eax
556	palignr	$2, %xmm1, %xmm2
557	movaps	%xmm4, 32(%edx)
558	movaps	%xmm3, 16(%edx)
559	movaps	%xmm7, %xmm1
560	movaps	%xmm2, (%edx)
561	lea	64(%edx), %edx
562	sub	$64, %ecx
563	ja	L(Shl2LoopStart)
564
565L(Shl2LoopLeave):
566	add	$32, %ecx
567	jle	L(shl_end_0)
568
569	movaps	14(%eax), %xmm2
570	movaps	30(%eax), %xmm3
571	palignr	$2, %xmm2, %xmm3
572	palignr	$2, %xmm1, %xmm2
573	movaps	%xmm2, (%edx)
574	movaps	%xmm3, 16(%edx)
575	lea	32(%edx, %ecx), %edx
576	lea	32(%eax, %ecx), %eax
577	POP (%edi)
578	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
579
580	CFI_PUSH (%edi)
581
582	.p2align 4
583L(sh_2_no_prefetch):
584	lea	-32(%ecx), %ecx
585	lea	-2(%eax), %eax
586	xor	%edi, %edi
587
588	.p2align 4
589L(sh_2_no_prefetch_loop):
590	movdqa	16(%eax, %edi), %xmm2
591	sub	$32, %ecx
592	movdqa	32(%eax, %edi), %xmm3
593	movdqa	%xmm3, %xmm4
594	palignr	$2, %xmm2, %xmm3
595	palignr	$2, %xmm1, %xmm2
596	lea	32(%edi), %edi
597	movdqa	%xmm2, -32(%edx, %edi)
598	movdqa	%xmm3, -16(%edx, %edi)
599	jb	L(sh_2_end_no_prefetch_loop)
600
601	movdqa	16(%eax, %edi), %xmm2
602	sub	$32, %ecx
603	movdqa	32(%eax, %edi), %xmm3
604	movdqa	%xmm3, %xmm1
605	palignr	$2, %xmm2, %xmm3
606	palignr	$2, %xmm4, %xmm2
607	lea	32(%edi), %edi
608	movdqa	%xmm2, -32(%edx, %edi)
609	movdqa	%xmm3, -16(%edx, %edi)
610	jae	L(sh_2_no_prefetch_loop)
611
612L(sh_2_end_no_prefetch_loop):
613	lea	32(%ecx), %ecx
614	add	%ecx, %edi
615	add	%edi, %edx
616	lea	2(%edi, %eax), %eax
617	POP	(%edi)
618	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
619
620	CFI_PUSH (%edi)
621
622	.p2align 4
623L(shl_3):
624#ifndef USE_AS_MEMMOVE
625	movaps	-3(%eax), %xmm1
626#else
627	movl	DEST+4(%esp), %edi
628	movaps	-3(%eax), %xmm1
629	movdqu	%xmm0, (%edi)
630#endif
631#ifdef DATA_CACHE_SIZE_HALF
632	cmp	$DATA_CACHE_SIZE_HALF, %ecx
633#else
634# if (defined SHARED || defined __PIC__)
635	SETUP_PIC_REG(bx)
636	add	$_GLOBAL_OFFSET_TABLE_, %ebx
637	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
638# else
639	cmp	__x86_data_cache_size_half, %ecx
640# endif
641#endif
642	jb L(sh_3_no_prefetch)
643
644	lea	-64(%ecx), %ecx
645
646	.p2align 4
647L(Shl3LoopStart):
648	prefetcht0 0x1c0(%eax)
649	prefetcht0 0x1c0(%edx)
650	movaps	13(%eax), %xmm2
651	movaps	29(%eax), %xmm3
652	movaps	45(%eax), %xmm4
653	movaps	61(%eax), %xmm5
654	movaps	%xmm5, %xmm7
655	palignr	$3, %xmm4, %xmm5
656	palignr	$3, %xmm3, %xmm4
657	movaps	%xmm5, 48(%edx)
658	palignr	$3, %xmm2, %xmm3
659	lea	64(%eax), %eax
660	palignr	$3, %xmm1, %xmm2
661	movaps	%xmm4, 32(%edx)
662	movaps	%xmm3, 16(%edx)
663	movaps	%xmm7, %xmm1
664	movaps	%xmm2, (%edx)
665	lea	64(%edx), %edx
666	sub	$64, %ecx
667	ja	L(Shl3LoopStart)
668
669L(Shl3LoopLeave):
670	add	$32, %ecx
671	jle	L(shl_end_0)
672
673	movaps	13(%eax), %xmm2
674	movaps	29(%eax), %xmm3
675	palignr	$3, %xmm2, %xmm3
676	palignr	$3, %xmm1, %xmm2
677	movaps	%xmm2, (%edx)
678	movaps	%xmm3, 16(%edx)
679	lea	32(%edx, %ecx), %edx
680	lea	32(%eax, %ecx), %eax
681	POP (%edi)
682	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
683
684	CFI_PUSH (%edi)
685
686	.p2align 4
687L(sh_3_no_prefetch):
688	lea	-32(%ecx), %ecx
689	lea	-3(%eax), %eax
690	xor	%edi, %edi
691
692	.p2align 4
693L(sh_3_no_prefetch_loop):
694	movdqa	16(%eax, %edi), %xmm2
695	sub	$32, %ecx
696	movdqa	32(%eax, %edi), %xmm3
697	movdqa	%xmm3, %xmm4
698	palignr	$3, %xmm2, %xmm3
699	palignr	$3, %xmm1, %xmm2
700	lea	32(%edi), %edi
701	movdqa	%xmm2, -32(%edx, %edi)
702	movdqa	%xmm3, -16(%edx, %edi)
703
704	jb	L(sh_3_end_no_prefetch_loop)
705
706	movdqa	16(%eax, %edi), %xmm2
707	sub	$32, %ecx
708	movdqa	32(%eax, %edi), %xmm3
709	movdqa	%xmm3, %xmm1
710	palignr	$3, %xmm2, %xmm3
711	palignr	$3, %xmm4, %xmm2
712	lea	32(%edi), %edi
713	movdqa	%xmm2, -32(%edx, %edi)
714	movdqa	%xmm3, -16(%edx, %edi)
715
716	jae	L(sh_3_no_prefetch_loop)
717
718L(sh_3_end_no_prefetch_loop):
719	lea	32(%ecx), %ecx
720	add	%ecx, %edi
721	add	%edi, %edx
722	lea	3(%edi, %eax), %eax
723	POP	(%edi)
724	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
725
726	CFI_PUSH (%edi)
727
728	.p2align 4
729L(shl_4):
730#ifndef USE_AS_MEMMOVE
731	movaps	-4(%eax), %xmm1
732#else
733	movl	DEST+4(%esp), %edi
734	movaps	-4(%eax), %xmm1
735	movdqu	%xmm0, (%edi)
736#endif
737#ifdef DATA_CACHE_SIZE_HALF
738	cmp	$DATA_CACHE_SIZE_HALF, %ecx
739#else
740# if (defined SHARED || defined __PIC__)
741	SETUP_PIC_REG(bx)
742	add	$_GLOBAL_OFFSET_TABLE_, %ebx
743	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
744# else
745	cmp	__x86_data_cache_size_half, %ecx
746# endif
747#endif
748	jb L(sh_4_no_prefetch)
749
750	lea	-64(%ecx), %ecx
751
752	.p2align 4
753L(Shl4LoopStart):
754	prefetcht0 0x1c0(%eax)
755	prefetcht0 0x1c0(%edx)
756	movaps	12(%eax), %xmm2
757	movaps	28(%eax), %xmm3
758	movaps	44(%eax), %xmm4
759	movaps	60(%eax), %xmm5
760	movaps	%xmm5, %xmm7
761	palignr	$4, %xmm4, %xmm5
762	palignr	$4, %xmm3, %xmm4
763	movaps	%xmm5, 48(%edx)
764	palignr	$4, %xmm2, %xmm3
765	lea	64(%eax), %eax
766	palignr	$4, %xmm1, %xmm2
767	movaps	%xmm4, 32(%edx)
768	movaps	%xmm3, 16(%edx)
769	movaps	%xmm7, %xmm1
770	movaps	%xmm2, (%edx)
771	lea	64(%edx), %edx
772	sub	$64, %ecx
773	ja	L(Shl4LoopStart)
774
775L(Shl4LoopLeave):
776	add	$32, %ecx
777	jle	L(shl_end_0)
778
779	movaps	12(%eax), %xmm2
780	movaps	28(%eax), %xmm3
781	palignr	$4, %xmm2, %xmm3
782	palignr	$4, %xmm1, %xmm2
783	movaps	%xmm2, (%edx)
784	movaps	%xmm3, 16(%edx)
785	lea	32(%edx, %ecx), %edx
786	lea	32(%eax, %ecx), %eax
787	POP (%edi)
788	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
789
790	CFI_PUSH (%edi)
791
792	.p2align 4
793L(sh_4_no_prefetch):
794	lea	-32(%ecx), %ecx
795	lea	-4(%eax), %eax
796	xor	%edi, %edi
797
798	.p2align 4
799L(sh_4_no_prefetch_loop):
800	movdqa	16(%eax, %edi), %xmm2
801	sub	$32, %ecx
802	movdqa	32(%eax, %edi), %xmm3
803	movdqa	%xmm3, %xmm4
804	palignr	$4, %xmm2, %xmm3
805	palignr	$4, %xmm1, %xmm2
806	lea	32(%edi), %edi
807	movdqa	%xmm2, -32(%edx, %edi)
808	movdqa	%xmm3, -16(%edx, %edi)
809
810	jb	L(sh_4_end_no_prefetch_loop)
811
812	movdqa	16(%eax, %edi), %xmm2
813	sub	$32, %ecx
814	movdqa	32(%eax, %edi), %xmm3
815	movdqa	%xmm3, %xmm1
816	palignr	$4, %xmm2, %xmm3
817	palignr	$4, %xmm4, %xmm2
818	lea	32(%edi), %edi
819	movdqa	%xmm2, -32(%edx, %edi)
820	movdqa	%xmm3, -16(%edx, %edi)
821
822	jae	L(sh_4_no_prefetch_loop)
823
824L(sh_4_end_no_prefetch_loop):
825	lea	32(%ecx), %ecx
826	add	%ecx, %edi
827	add	%edi, %edx
828	lea	4(%edi, %eax), %eax
829	POP	(%edi)
830	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
831
832	CFI_PUSH (%edi)
833
834	.p2align 4
835L(shl_5):
836#ifndef USE_AS_MEMMOVE
837	movaps	-5(%eax), %xmm1
838#else
839	movl	DEST+4(%esp), %edi
840	movaps	-5(%eax), %xmm1
841	movdqu	%xmm0, (%edi)
842#endif
843#ifdef DATA_CACHE_SIZE_HALF
844	cmp	$DATA_CACHE_SIZE_HALF, %ecx
845#else
846# if (defined SHARED || defined __PIC__)
847	SETUP_PIC_REG(bx)
848	add	$_GLOBAL_OFFSET_TABLE_, %ebx
849	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
850# else
851	cmp	__x86_data_cache_size_half, %ecx
852# endif
853#endif
854	jb L(sh_5_no_prefetch)
855
856	lea	-64(%ecx), %ecx
857
858	.p2align 4
859L(Shl5LoopStart):
860	prefetcht0 0x1c0(%eax)
861	prefetcht0 0x1c0(%edx)
862	movaps	11(%eax), %xmm2
863	movaps	27(%eax), %xmm3
864	movaps	43(%eax), %xmm4
865	movaps	59(%eax), %xmm5
866	movaps	%xmm5, %xmm7
867	palignr	$5, %xmm4, %xmm5
868	palignr	$5, %xmm3, %xmm4
869	movaps	%xmm5, 48(%edx)
870	palignr	$5, %xmm2, %xmm3
871	lea	64(%eax), %eax
872	palignr	$5, %xmm1, %xmm2
873	movaps	%xmm4, 32(%edx)
874	movaps	%xmm3, 16(%edx)
875	movaps	%xmm7, %xmm1
876	movaps	%xmm2, (%edx)
877	lea	64(%edx), %edx
878	sub	$64, %ecx
879	ja	L(Shl5LoopStart)
880
881L(Shl5LoopLeave):
882	add	$32, %ecx
883	jle	L(shl_end_0)
884
885	movaps	11(%eax), %xmm2
886	movaps	27(%eax), %xmm3
887	palignr	$5, %xmm2, %xmm3
888	palignr	$5, %xmm1, %xmm2
889	movaps	%xmm2, (%edx)
890	movaps	%xmm3, 16(%edx)
891	lea	32(%edx, %ecx), %edx
892	lea	32(%eax, %ecx), %eax
893	POP (%edi)
894	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
895
896	CFI_PUSH (%edi)
897
898	.p2align 4
899L(sh_5_no_prefetch):
900	lea	-32(%ecx), %ecx
901	lea	-5(%eax), %eax
902	xor	%edi, %edi
903
904	.p2align 4
905L(sh_5_no_prefetch_loop):
906	movdqa	16(%eax, %edi), %xmm2
907	sub	$32, %ecx
908	movdqa	32(%eax, %edi), %xmm3
909	movdqa	%xmm3, %xmm4
910	palignr	$5, %xmm2, %xmm3
911	palignr	$5, %xmm1, %xmm2
912	lea	32(%edi), %edi
913	movdqa	%xmm2, -32(%edx, %edi)
914	movdqa	%xmm3, -16(%edx, %edi)
915
916	jb	L(sh_5_end_no_prefetch_loop)
917
918	movdqa	16(%eax, %edi), %xmm2
919	sub	$32, %ecx
920	movdqa	32(%eax, %edi), %xmm3
921	movdqa	%xmm3, %xmm1
922	palignr	$5, %xmm2, %xmm3
923	palignr	$5, %xmm4, %xmm2
924	lea	32(%edi), %edi
925	movdqa	%xmm2, -32(%edx, %edi)
926	movdqa	%xmm3, -16(%edx, %edi)
927
928	jae	L(sh_5_no_prefetch_loop)
929
930L(sh_5_end_no_prefetch_loop):
931	lea	32(%ecx), %ecx
932	add	%ecx, %edi
933	add	%edi, %edx
934	lea	5(%edi, %eax), %eax
935	POP	(%edi)
936	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
937
938	CFI_PUSH (%edi)
939
940	.p2align 4
941L(shl_6):
942#ifndef USE_AS_MEMMOVE
943	movaps	-6(%eax), %xmm1
944#else
945	movl	DEST+4(%esp), %edi
946	movaps	-6(%eax), %xmm1
947	movdqu	%xmm0, (%edi)
948#endif
949#ifdef DATA_CACHE_SIZE_HALF
950	cmp	$DATA_CACHE_SIZE_HALF, %ecx
951#else
952# if (defined SHARED || defined __PIC__)
953	SETUP_PIC_REG(bx)
954	add	$_GLOBAL_OFFSET_TABLE_, %ebx
955	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
956# else
957	cmp	__x86_data_cache_size_half, %ecx
958# endif
959#endif
960	jb L(sh_6_no_prefetch)
961
962	lea	-64(%ecx), %ecx
963
964	.p2align 4
965L(Shl6LoopStart):
966	prefetcht0 0x1c0(%eax)
967	prefetcht0 0x1c0(%edx)
968	movaps	10(%eax), %xmm2
969	movaps	26(%eax), %xmm3
970	movaps	42(%eax), %xmm4
971	movaps	58(%eax), %xmm5
972	movaps	%xmm5, %xmm7
973	palignr	$6, %xmm4, %xmm5
974	palignr	$6, %xmm3, %xmm4
975	movaps	%xmm5, 48(%edx)
976	palignr	$6, %xmm2, %xmm3
977	lea	64(%eax), %eax
978	palignr	$6, %xmm1, %xmm2
979	movaps	%xmm4, 32(%edx)
980	movaps	%xmm3, 16(%edx)
981	movaps	%xmm7, %xmm1
982	movaps	%xmm2, (%edx)
983	lea	64(%edx), %edx
984	sub	$64, %ecx
985	ja	L(Shl6LoopStart)
986
987L(Shl6LoopLeave):
988	add	$32, %ecx
989	jle	L(shl_end_0)
990
991	movaps	10(%eax), %xmm2
992	movaps	26(%eax), %xmm3
993	palignr	$6, %xmm2, %xmm3
994	palignr	$6, %xmm1, %xmm2
995	movaps	%xmm2, (%edx)
996	movaps	%xmm3, 16(%edx)
997	lea	32(%edx, %ecx), %edx
998	lea	32(%eax, %ecx), %eax
999	POP (%edi)
1000	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1001
1002	CFI_PUSH (%edi)
1003
1004	.p2align 4
1005L(sh_6_no_prefetch):
1006	lea	-32(%ecx), %ecx
1007	lea	-6(%eax), %eax
1008	xor	%edi, %edi
1009
1010	.p2align 4
1011L(sh_6_no_prefetch_loop):
1012	movdqa	16(%eax, %edi), %xmm2
1013	sub	$32, %ecx
1014	movdqa	32(%eax, %edi), %xmm3
1015	movdqa	%xmm3, %xmm4
1016	palignr	$6, %xmm2, %xmm3
1017	palignr	$6, %xmm1, %xmm2
1018	lea	32(%edi), %edi
1019	movdqa	%xmm2, -32(%edx, %edi)
1020	movdqa	%xmm3, -16(%edx, %edi)
1021
1022	jb	L(sh_6_end_no_prefetch_loop)
1023
1024	movdqa	16(%eax, %edi), %xmm2
1025	sub	$32, %ecx
1026	movdqa	32(%eax, %edi), %xmm3
1027	movdqa	%xmm3, %xmm1
1028	palignr	$6, %xmm2, %xmm3
1029	palignr	$6, %xmm4, %xmm2
1030	lea	32(%edi), %edi
1031	movdqa	%xmm2, -32(%edx, %edi)
1032	movdqa	%xmm3, -16(%edx, %edi)
1033
1034	jae	L(sh_6_no_prefetch_loop)
1035
1036L(sh_6_end_no_prefetch_loop):
1037	lea	32(%ecx), %ecx
1038	add	%ecx, %edi
1039	add	%edi, %edx
1040	lea	6(%edi, %eax), %eax
1041	POP	(%edi)
1042	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1043
1044	CFI_PUSH (%edi)
1045
1046	.p2align 4
1047L(shl_7):
1048#ifndef USE_AS_MEMMOVE
1049	movaps	-7(%eax), %xmm1
1050#else
1051	movl	DEST+4(%esp), %edi
1052	movaps	-7(%eax), %xmm1
1053	movdqu	%xmm0, (%edi)
1054#endif
1055#ifdef DATA_CACHE_SIZE_HALF
1056	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1057#else
1058# if (defined SHARED || defined __PIC__)
1059	SETUP_PIC_REG(bx)
1060	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1061	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1062# else
1063	cmp	__x86_data_cache_size_half, %ecx
1064# endif
1065#endif
1066	jb L(sh_7_no_prefetch)
1067
1068	lea	-64(%ecx), %ecx
1069
1070	.p2align 4
1071L(Shl7LoopStart):
1072	prefetcht0 0x1c0(%eax)
1073	prefetcht0 0x1c0(%edx)
1074	movaps	9(%eax), %xmm2
1075	movaps	25(%eax), %xmm3
1076	movaps	41(%eax), %xmm4
1077	movaps	57(%eax), %xmm5
1078	movaps	%xmm5, %xmm7
1079	palignr	$7, %xmm4, %xmm5
1080	palignr	$7, %xmm3, %xmm4
1081	movaps	%xmm5, 48(%edx)
1082	palignr	$7, %xmm2, %xmm3
1083	lea	64(%eax), %eax
1084	palignr	$7, %xmm1, %xmm2
1085	movaps	%xmm4, 32(%edx)
1086	movaps	%xmm3, 16(%edx)
1087	movaps	%xmm7, %xmm1
1088	movaps	%xmm2, (%edx)
1089	lea	64(%edx), %edx
1090	sub	$64, %ecx
1091	ja	L(Shl7LoopStart)
1092
1093L(Shl7LoopLeave):
1094	add	$32, %ecx
1095	jle	L(shl_end_0)
1096
1097	movaps	9(%eax), %xmm2
1098	movaps	25(%eax), %xmm3
1099	palignr	$7, %xmm2, %xmm3
1100	palignr	$7, %xmm1, %xmm2
1101	movaps	%xmm2, (%edx)
1102	movaps	%xmm3, 16(%edx)
1103	lea	32(%edx, %ecx), %edx
1104	lea	32(%eax, %ecx), %eax
1105	POP (%edi)
1106	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1107
1108	CFI_PUSH (%edi)
1109
1110	.p2align 4
1111L(sh_7_no_prefetch):
1112	lea	-32(%ecx), %ecx
1113	lea	-7(%eax), %eax
1114	xor	%edi, %edi
1115
1116	.p2align 4
1117L(sh_7_no_prefetch_loop):
1118	movdqa	16(%eax, %edi), %xmm2
1119	sub	$32, %ecx
1120	movdqa	32(%eax, %edi), %xmm3
1121	movdqa	%xmm3, %xmm4
1122	palignr	$7, %xmm2, %xmm3
1123	palignr	$7, %xmm1, %xmm2
1124	lea	32(%edi), %edi
1125	movdqa	%xmm2, -32(%edx, %edi)
1126	movdqa	%xmm3, -16(%edx, %edi)
1127	jb	L(sh_7_end_no_prefetch_loop)
1128
1129	movdqa	16(%eax, %edi), %xmm2
1130	sub	$32, %ecx
1131	movdqa	32(%eax, %edi), %xmm3
1132	movdqa	%xmm3, %xmm1
1133	palignr	$7, %xmm2, %xmm3
1134	palignr	$7, %xmm4, %xmm2
1135	lea	32(%edi), %edi
1136	movdqa	%xmm2, -32(%edx, %edi)
1137	movdqa	%xmm3, -16(%edx, %edi)
1138	jae	L(sh_7_no_prefetch_loop)
1139
1140L(sh_7_end_no_prefetch_loop):
1141	lea	32(%ecx), %ecx
1142	add	%ecx, %edi
1143	add	%edi, %edx
1144	lea	7(%edi, %eax), %eax
1145	POP	(%edi)
1146	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1147
1148	CFI_PUSH (%edi)
1149
1150	.p2align 4
1151L(shl_8):
1152#ifndef USE_AS_MEMMOVE
1153	movaps	-8(%eax), %xmm1
1154#else
1155	movl	DEST+4(%esp), %edi
1156	movaps	-8(%eax), %xmm1
1157	movdqu	%xmm0, (%edi)
1158#endif
1159#ifdef DATA_CACHE_SIZE_HALF
1160	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1161#else
1162# if (defined SHARED || defined __PIC__)
1163	SETUP_PIC_REG(bx)
1164	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1165	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1166# else
1167	cmp	__x86_data_cache_size_half, %ecx
1168# endif
1169#endif
1170	jb L(sh_8_no_prefetch)
1171
1172	lea	-64(%ecx), %ecx
1173
1174	.p2align 4
1175L(Shl8LoopStart):
1176	prefetcht0 0x1c0(%eax)
1177	prefetcht0 0x1c0(%edx)
1178	movaps	8(%eax), %xmm2
1179	movaps	24(%eax), %xmm3
1180	movaps	40(%eax), %xmm4
1181	movaps	56(%eax), %xmm5
1182	movaps	%xmm5, %xmm7
1183	palignr	$8, %xmm4, %xmm5
1184	palignr	$8, %xmm3, %xmm4
1185	movaps	%xmm5, 48(%edx)
1186	palignr	$8, %xmm2, %xmm3
1187	lea	64(%eax), %eax
1188	palignr	$8, %xmm1, %xmm2
1189	movaps	%xmm4, 32(%edx)
1190	movaps	%xmm3, 16(%edx)
1191	movaps	%xmm7, %xmm1
1192	movaps	%xmm2, (%edx)
1193	lea	64(%edx), %edx
1194	sub	$64, %ecx
1195	ja	L(Shl8LoopStart)
1196
1197L(LoopLeave8):
1198	add	$32, %ecx
1199	jle	L(shl_end_0)
1200
1201	movaps	8(%eax), %xmm2
1202	movaps	24(%eax), %xmm3
1203	palignr	$8, %xmm2, %xmm3
1204	palignr	$8, %xmm1, %xmm2
1205	movaps	%xmm2, (%edx)
1206	movaps	%xmm3, 16(%edx)
1207	lea	32(%edx, %ecx), %edx
1208	lea	32(%eax, %ecx), %eax
1209	POP (%edi)
1210	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1211
1212	CFI_PUSH (%edi)
1213
1214	.p2align 4
1215L(sh_8_no_prefetch):
1216	lea	-32(%ecx), %ecx
1217	lea	-8(%eax), %eax
1218	xor	%edi, %edi
1219
1220	.p2align 4
1221L(sh_8_no_prefetch_loop):
1222	movdqa	16(%eax, %edi), %xmm2
1223	sub	$32, %ecx
1224	movdqa	32(%eax, %edi), %xmm3
1225	movdqa	%xmm3, %xmm4
1226	palignr	$8, %xmm2, %xmm3
1227	palignr	$8, %xmm1, %xmm2
1228	lea	32(%edi), %edi
1229	movdqa	%xmm2, -32(%edx, %edi)
1230	movdqa	%xmm3, -16(%edx, %edi)
1231	jb	L(sh_8_end_no_prefetch_loop)
1232
1233	movdqa	16(%eax, %edi), %xmm2
1234	sub	$32, %ecx
1235	movdqa	32(%eax, %edi), %xmm3
1236	movdqa	%xmm3, %xmm1
1237	palignr	$8, %xmm2, %xmm3
1238	palignr	$8, %xmm4, %xmm2
1239	lea	32(%edi), %edi
1240	movdqa	%xmm2, -32(%edx, %edi)
1241	movdqa	%xmm3, -16(%edx, %edi)
1242	jae	L(sh_8_no_prefetch_loop)
1243
1244L(sh_8_end_no_prefetch_loop):
1245	lea	32(%ecx), %ecx
1246	add	%ecx, %edi
1247	add	%edi, %edx
1248	lea	8(%edi, %eax), %eax
1249	POP	(%edi)
1250	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1251
1252	CFI_PUSH (%edi)
1253
1254	.p2align 4
1255L(shl_9):
1256#ifndef USE_AS_MEMMOVE
1257	movaps	-9(%eax), %xmm1
1258#else
1259	movl	DEST+4(%esp), %edi
1260	movaps	-9(%eax), %xmm1
1261	movdqu	%xmm0, (%edi)
1262#endif
1263#ifdef DATA_CACHE_SIZE_HALF
1264	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1265#else
1266# if (defined SHARED || defined __PIC__)
1267	SETUP_PIC_REG(bx)
1268	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1269	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1270# else
1271	cmp	__x86_data_cache_size_half, %ecx
1272# endif
1273#endif
1274	jb L(sh_9_no_prefetch)
1275
1276	lea	-64(%ecx), %ecx
1277
1278	.p2align 4
1279L(Shl9LoopStart):
1280	prefetcht0 0x1c0(%eax)
1281	prefetcht0 0x1c0(%edx)
1282	movaps	7(%eax), %xmm2
1283	movaps	23(%eax), %xmm3
1284	movaps	39(%eax), %xmm4
1285	movaps	55(%eax), %xmm5
1286	movaps	%xmm5, %xmm7
1287	palignr	$9, %xmm4, %xmm5
1288	palignr	$9, %xmm3, %xmm4
1289	movaps	%xmm5, 48(%edx)
1290	palignr	$9, %xmm2, %xmm3
1291	lea	64(%eax), %eax
1292	palignr	$9, %xmm1, %xmm2
1293	movaps	%xmm4, 32(%edx)
1294	movaps	%xmm3, 16(%edx)
1295	movaps	%xmm7, %xmm1
1296	movaps	%xmm2, (%edx)
1297	lea	64(%edx), %edx
1298	sub	$64, %ecx
1299	ja	L(Shl9LoopStart)
1300
1301L(Shl9LoopLeave):
1302	add	$32, %ecx
1303	jle	L(shl_end_0)
1304
1305	movaps	7(%eax), %xmm2
1306	movaps	23(%eax), %xmm3
1307	palignr	$9, %xmm2, %xmm3
1308	palignr	$9, %xmm1, %xmm2
1309
1310	movaps	%xmm2, (%edx)
1311	movaps	%xmm3, 16(%edx)
1312	lea	32(%edx, %ecx), %edx
1313	lea	32(%eax, %ecx), %eax
1314	POP (%edi)
1315	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1316
1317	CFI_PUSH (%edi)
1318
1319	.p2align 4
1320L(sh_9_no_prefetch):
1321	lea	-32(%ecx), %ecx
1322	lea	-9(%eax), %eax
1323	xor	%edi, %edi
1324
1325	.p2align 4
1326L(sh_9_no_prefetch_loop):
1327	movdqa	16(%eax, %edi), %xmm2
1328	sub	$32, %ecx
1329	movdqa	32(%eax, %edi), %xmm3
1330	movdqa	%xmm3, %xmm4
1331	palignr	$9, %xmm2, %xmm3
1332	palignr	$9, %xmm1, %xmm2
1333	lea	32(%edi), %edi
1334	movdqa	%xmm2, -32(%edx, %edi)
1335	movdqa	%xmm3, -16(%edx, %edi)
1336	jb	L(sh_9_end_no_prefetch_loop)
1337
1338	movdqa	16(%eax, %edi), %xmm2
1339	sub	$32, %ecx
1340	movdqa	32(%eax, %edi), %xmm3
1341	movdqa	%xmm3, %xmm1
1342	palignr	$9, %xmm2, %xmm3
1343	palignr	$9, %xmm4, %xmm2
1344	lea	32(%edi), %edi
1345	movdqa	%xmm2, -32(%edx, %edi)
1346	movdqa	%xmm3, -16(%edx, %edi)
1347	jae	L(sh_9_no_prefetch_loop)
1348
1349L(sh_9_end_no_prefetch_loop):
1350	lea	32(%ecx), %ecx
1351	add	%ecx, %edi
1352	add	%edi, %edx
1353	lea	9(%edi, %eax), %eax
1354	POP	(%edi)
1355	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1356
1357	CFI_PUSH (%edi)
1358
1359	.p2align 4
1360L(shl_10):
1361#ifndef USE_AS_MEMMOVE
1362	movaps	-10(%eax), %xmm1
1363#else
1364	movl	DEST+4(%esp), %edi
1365	movaps	-10(%eax), %xmm1
1366	movdqu	%xmm0, (%edi)
1367#endif
1368#ifdef DATA_CACHE_SIZE_HALF
1369	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1370#else
1371# if (defined SHARED || defined __PIC__)
1372	SETUP_PIC_REG(bx)
1373	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1374	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1375# else
1376	cmp	__x86_data_cache_size_half, %ecx
1377# endif
1378#endif
1379	jb L(sh_10_no_prefetch)
1380
1381	lea	-64(%ecx), %ecx
1382
1383	.p2align 4
1384L(Shl10LoopStart):
1385	prefetcht0 0x1c0(%eax)
1386	prefetcht0 0x1c0(%edx)
1387	movaps	6(%eax), %xmm2
1388	movaps	22(%eax), %xmm3
1389	movaps	38(%eax), %xmm4
1390	movaps	54(%eax), %xmm5
1391	movaps	%xmm5, %xmm7
1392	palignr	$10, %xmm4, %xmm5
1393	palignr	$10, %xmm3, %xmm4
1394	movaps	%xmm5, 48(%edx)
1395	palignr	$10, %xmm2, %xmm3
1396	lea	64(%eax), %eax
1397	palignr	$10, %xmm1, %xmm2
1398	movaps	%xmm4, 32(%edx)
1399	movaps	%xmm3, 16(%edx)
1400	movaps	%xmm7, %xmm1
1401	movaps	%xmm2, (%edx)
1402	lea	64(%edx), %edx
1403	sub	$64, %ecx
1404	ja	L(Shl10LoopStart)
1405
1406L(Shl10LoopLeave):
1407	add	$32, %ecx
1408	jle	L(shl_end_0)
1409
1410	movaps	6(%eax), %xmm2
1411	movaps	22(%eax), %xmm3
1412	palignr	$10, %xmm2, %xmm3
1413	palignr	$10, %xmm1, %xmm2
1414
1415	movaps	%xmm2, (%edx)
1416	movaps	%xmm3, 16(%edx)
1417	lea	32(%edx, %ecx), %edx
1418	lea	32(%eax, %ecx), %eax
1419	POP (%edi)
1420	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1421
1422	CFI_PUSH (%edi)
1423
1424	.p2align 4
1425L(sh_10_no_prefetch):
1426	lea	-32(%ecx), %ecx
1427	lea	-10(%eax), %eax
1428	xor	%edi, %edi
1429
1430	.p2align 4
1431L(sh_10_no_prefetch_loop):
1432	movdqa	16(%eax, %edi), %xmm2
1433	sub	$32, %ecx
1434	movdqa	32(%eax, %edi), %xmm3
1435	movdqa	%xmm3, %xmm4
1436	palignr	$10, %xmm2, %xmm3
1437	palignr	$10, %xmm1, %xmm2
1438	lea	32(%edi), %edi
1439	movdqa	%xmm2, -32(%edx, %edi)
1440	movdqa	%xmm3, -16(%edx, %edi)
1441	jb	L(sh_10_end_no_prefetch_loop)
1442
1443	movdqa	16(%eax, %edi), %xmm2
1444	sub	$32, %ecx
1445	movdqa	32(%eax, %edi), %xmm3
1446	movdqa	%xmm3, %xmm1
1447	palignr	$10, %xmm2, %xmm3
1448	palignr	$10, %xmm4, %xmm2
1449	lea	32(%edi), %edi
1450	movdqa	%xmm2, -32(%edx, %edi)
1451	movdqa	%xmm3, -16(%edx, %edi)
1452	jae	L(sh_10_no_prefetch_loop)
1453
1454L(sh_10_end_no_prefetch_loop):
1455	lea	32(%ecx), %ecx
1456	add	%ecx, %edi
1457	add	%edi, %edx
1458	lea	10(%edi, %eax), %eax
1459	POP	(%edi)
1460	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1461
1462	CFI_PUSH (%edi)
1463
1464	.p2align 4
1465L(shl_11):
1466#ifndef USE_AS_MEMMOVE
1467	movaps	-11(%eax), %xmm1
1468#else
1469	movl	DEST+4(%esp), %edi
1470	movaps	-11(%eax), %xmm1
1471	movdqu	%xmm0, (%edi)
1472#endif
1473#ifdef DATA_CACHE_SIZE_HALF
1474	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1475#else
1476# if (defined SHARED || defined __PIC__)
1477	SETUP_PIC_REG(bx)
1478	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1479	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1480# else
1481	cmp	__x86_data_cache_size_half, %ecx
1482# endif
1483#endif
1484	jb L(sh_11_no_prefetch)
1485
1486	lea	-64(%ecx), %ecx
1487
1488	.p2align 4
1489L(Shl11LoopStart):
1490	prefetcht0 0x1c0(%eax)
1491	prefetcht0 0x1c0(%edx)
1492	movaps	5(%eax), %xmm2
1493	movaps	21(%eax), %xmm3
1494	movaps	37(%eax), %xmm4
1495	movaps	53(%eax), %xmm5
1496	movaps	%xmm5, %xmm7
1497	palignr	$11, %xmm4, %xmm5
1498	palignr	$11, %xmm3, %xmm4
1499	movaps	%xmm5, 48(%edx)
1500	palignr	$11, %xmm2, %xmm3
1501	lea	64(%eax), %eax
1502	palignr	$11, %xmm1, %xmm2
1503	movaps	%xmm4, 32(%edx)
1504	movaps	%xmm3, 16(%edx)
1505	movaps	%xmm7, %xmm1
1506	movaps	%xmm2, (%edx)
1507	lea	64(%edx), %edx
1508	sub	$64, %ecx
1509	ja	L(Shl11LoopStart)
1510
1511L(Shl11LoopLeave):
1512	add	$32, %ecx
1513	jle	L(shl_end_0)
1514
1515	movaps	5(%eax), %xmm2
1516	movaps	21(%eax), %xmm3
1517	palignr	$11, %xmm2, %xmm3
1518	palignr	$11, %xmm1, %xmm2
1519
1520	movaps	%xmm2, (%edx)
1521	movaps	%xmm3, 16(%edx)
1522	lea	32(%edx, %ecx), %edx
1523	lea	32(%eax, %ecx), %eax
1524	POP (%edi)
1525	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1526
1527	CFI_PUSH (%edi)
1528
1529	.p2align 4
1530L(sh_11_no_prefetch):
1531	lea	-32(%ecx), %ecx
1532	lea	-11(%eax), %eax
1533	xor	%edi, %edi
1534
1535	.p2align 4
1536L(sh_11_no_prefetch_loop):
1537	movdqa	16(%eax, %edi), %xmm2
1538	sub	$32, %ecx
1539	movdqa	32(%eax, %edi), %xmm3
1540	movdqa	%xmm3, %xmm4
1541	palignr	$11, %xmm2, %xmm3
1542	palignr	$11, %xmm1, %xmm2
1543	lea	32(%edi), %edi
1544	movdqa	%xmm2, -32(%edx, %edi)
1545	movdqa	%xmm3, -16(%edx, %edi)
1546	jb	L(sh_11_end_no_prefetch_loop)
1547
1548	movdqa	16(%eax, %edi), %xmm2
1549	sub	$32, %ecx
1550	movdqa	32(%eax, %edi), %xmm3
1551	movdqa	%xmm3, %xmm1
1552	palignr	$11, %xmm2, %xmm3
1553	palignr	$11, %xmm4, %xmm2
1554	lea	32(%edi), %edi
1555	movdqa	%xmm2, -32(%edx, %edi)
1556	movdqa	%xmm3, -16(%edx, %edi)
1557	jae	L(sh_11_no_prefetch_loop)
1558
1559L(sh_11_end_no_prefetch_loop):
1560	lea	32(%ecx), %ecx
1561	add	%ecx, %edi
1562	add	%edi, %edx
1563	lea	11(%edi, %eax), %eax
1564	POP	(%edi)
1565	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1566
1567	CFI_PUSH (%edi)
1568
1569	.p2align 4
1570L(shl_12):
1571#ifndef USE_AS_MEMMOVE
1572	movaps	-12(%eax), %xmm1
1573#else
1574	movl	DEST+4(%esp), %edi
1575	movaps	-12(%eax), %xmm1
1576	movdqu	%xmm0, (%edi)
1577#endif
1578#ifdef DATA_CACHE_SIZE_HALF
1579	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1580#else
1581# if (defined SHARED || defined __PIC__)
1582	SETUP_PIC_REG(bx)
1583	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1584	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1585# else
1586	cmp	__x86_data_cache_size_half, %ecx
1587# endif
1588#endif
1589	jb L(sh_12_no_prefetch)
1590
1591	lea	-64(%ecx), %ecx
1592
1593	.p2align 4
1594L(Shl12LoopStart):
1595	prefetcht0 0x1c0(%eax)
1596	prefetcht0 0x1c0(%edx)
1597	movaps	4(%eax), %xmm2
1598	movaps	20(%eax), %xmm3
1599	movaps	36(%eax), %xmm4
1600	movaps	52(%eax), %xmm5
1601	movaps	%xmm5, %xmm7
1602	palignr	$12, %xmm4, %xmm5
1603	palignr	$12, %xmm3, %xmm4
1604	movaps	%xmm5, 48(%edx)
1605	palignr	$12, %xmm2, %xmm3
1606	lea	64(%eax), %eax
1607	palignr	$12, %xmm1, %xmm2
1608	movaps	%xmm4, 32(%edx)
1609	movaps	%xmm3, 16(%edx)
1610	movaps	%xmm7, %xmm1
1611	movaps	%xmm2, (%edx)
1612	lea	64(%edx), %edx
1613	sub	$64, %ecx
1614	ja	L(Shl12LoopStart)
1615
1616L(Shl12LoopLeave):
1617	add	$32, %ecx
1618	jle	L(shl_end_0)
1619
1620	movaps	4(%eax), %xmm2
1621	movaps	20(%eax), %xmm3
1622	palignr	$12, %xmm2, %xmm3
1623	palignr	$12, %xmm1, %xmm2
1624
1625	movaps	%xmm2, (%edx)
1626	movaps	%xmm3, 16(%edx)
1627	lea	32(%edx, %ecx), %edx
1628	lea	32(%eax, %ecx), %eax
1629	POP (%edi)
1630	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1631
1632	CFI_PUSH (%edi)
1633
1634	.p2align 4
1635L(sh_12_no_prefetch):
1636	lea	-32(%ecx), %ecx
1637	lea	-12(%eax), %eax
1638	xor	%edi, %edi
1639
1640	.p2align 4
1641L(sh_12_no_prefetch_loop):
1642	movdqa	16(%eax, %edi), %xmm2
1643	sub	$32, %ecx
1644	movdqa	32(%eax, %edi), %xmm3
1645	movdqa	%xmm3, %xmm4
1646	palignr	$12, %xmm2, %xmm3
1647	palignr	$12, %xmm1, %xmm2
1648	lea	32(%edi), %edi
1649	movdqa	%xmm2, -32(%edx, %edi)
1650	movdqa	%xmm3, -16(%edx, %edi)
1651	jb	L(sh_12_end_no_prefetch_loop)
1652
1653	movdqa	16(%eax, %edi), %xmm2
1654	sub	$32, %ecx
1655	movdqa	32(%eax, %edi), %xmm3
1656	movdqa	%xmm3, %xmm1
1657	palignr	$12, %xmm2, %xmm3
1658	palignr	$12, %xmm4, %xmm2
1659	lea	32(%edi), %edi
1660	movdqa	%xmm2, -32(%edx, %edi)
1661	movdqa	%xmm3, -16(%edx, %edi)
1662	jae	L(sh_12_no_prefetch_loop)
1663
1664L(sh_12_end_no_prefetch_loop):
1665	lea	32(%ecx), %ecx
1666	add	%ecx, %edi
1667	add	%edi, %edx
1668	lea	12(%edi, %eax), %eax
1669	POP	(%edi)
1670	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1671
1672	CFI_PUSH (%edi)
1673
1674	.p2align 4
1675L(shl_13):
1676#ifndef USE_AS_MEMMOVE
1677	movaps	-13(%eax), %xmm1
1678#else
1679	movl	DEST+4(%esp), %edi
1680	movaps	-13(%eax), %xmm1
1681	movdqu	%xmm0, (%edi)
1682#endif
1683#ifdef DATA_CACHE_SIZE_HALF
1684	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1685#else
1686# if (defined SHARED || defined __PIC__)
1687	SETUP_PIC_REG(bx)
1688	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1689	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1690# else
1691	cmp	__x86_data_cache_size_half, %ecx
1692# endif
1693#endif
1694	jb L(sh_13_no_prefetch)
1695
1696	lea	-64(%ecx), %ecx
1697
1698	.p2align 4
1699L(Shl13LoopStart):
1700	prefetcht0 0x1c0(%eax)
1701	prefetcht0 0x1c0(%edx)
1702	movaps	3(%eax), %xmm2
1703	movaps	19(%eax), %xmm3
1704	movaps	35(%eax), %xmm4
1705	movaps	51(%eax), %xmm5
1706	movaps	%xmm5, %xmm7
1707	palignr	$13, %xmm4, %xmm5
1708	palignr	$13, %xmm3, %xmm4
1709	movaps	%xmm5, 48(%edx)
1710	palignr	$13, %xmm2, %xmm3
1711	lea	64(%eax), %eax
1712	palignr	$13, %xmm1, %xmm2
1713	movaps	%xmm4, 32(%edx)
1714	movaps	%xmm3, 16(%edx)
1715	movaps	%xmm7, %xmm1
1716	movaps	%xmm2, (%edx)
1717	lea	64(%edx), %edx
1718	sub	$64, %ecx
1719	ja	L(Shl13LoopStart)
1720
1721L(Shl13LoopLeave):
1722	add	$32, %ecx
1723	jle	L(shl_end_0)
1724
1725	movaps	3(%eax), %xmm2
1726	movaps	19(%eax), %xmm3
1727	palignr	$13, %xmm2, %xmm3
1728	palignr	$13, %xmm1, %xmm2
1729
1730	movaps	%xmm2, (%edx)
1731	movaps	%xmm3, 16(%edx)
1732	lea	32(%edx, %ecx), %edx
1733	lea	32(%eax, %ecx), %eax
1734	POP (%edi)
1735	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1736
1737	CFI_PUSH (%edi)
1738
1739	.p2align 4
1740L(sh_13_no_prefetch):
1741	lea	-32(%ecx), %ecx
1742	lea	-13(%eax), %eax
1743	xor	%edi, %edi
1744
1745	.p2align 4
1746L(sh_13_no_prefetch_loop):
1747	movdqa	16(%eax, %edi), %xmm2
1748	sub	$32, %ecx
1749	movdqa	32(%eax, %edi), %xmm3
1750	movdqa	%xmm3, %xmm4
1751	palignr	$13, %xmm2, %xmm3
1752	palignr	$13, %xmm1, %xmm2
1753	lea	32(%edi), %edi
1754	movdqa	%xmm2, -32(%edx, %edi)
1755	movdqa	%xmm3, -16(%edx, %edi)
1756	jb	L(sh_13_end_no_prefetch_loop)
1757
1758	movdqa	16(%eax, %edi), %xmm2
1759	sub	$32, %ecx
1760	movdqa	32(%eax, %edi), %xmm3
1761	movdqa	%xmm3, %xmm1
1762	palignr	$13, %xmm2, %xmm3
1763	palignr	$13, %xmm4, %xmm2
1764	lea	32(%edi), %edi
1765	movdqa	%xmm2, -32(%edx, %edi)
1766	movdqa	%xmm3, -16(%edx, %edi)
1767	jae	L(sh_13_no_prefetch_loop)
1768
1769L(sh_13_end_no_prefetch_loop):
1770	lea	32(%ecx), %ecx
1771	add	%ecx, %edi
1772	add	%edi, %edx
1773	lea	13(%edi, %eax), %eax
1774	POP	(%edi)
1775	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1776
1777	CFI_PUSH (%edi)
1778
1779	.p2align 4
1780L(shl_14):
1781#ifndef USE_AS_MEMMOVE
1782	movaps	-14(%eax), %xmm1
1783#else
1784	movl	DEST+4(%esp), %edi
1785	movaps	-14(%eax), %xmm1
1786	movdqu	%xmm0, (%edi)
1787#endif
1788#ifdef DATA_CACHE_SIZE_HALF
1789	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1790#else
1791# if (defined SHARED || defined __PIC__)
1792	SETUP_PIC_REG(bx)
1793	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1794	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1795# else
1796	cmp	__x86_data_cache_size_half, %ecx
1797# endif
1798#endif
1799	jb L(sh_14_no_prefetch)
1800
1801	lea	-64(%ecx), %ecx
1802
1803	.p2align 4
1804L(Shl14LoopStart):
1805	prefetcht0 0x1c0(%eax)
1806	prefetcht0 0x1c0(%edx)
1807	movaps	2(%eax), %xmm2
1808	movaps	18(%eax), %xmm3
1809	movaps	34(%eax), %xmm4
1810	movaps	50(%eax), %xmm5
1811	movaps	%xmm5, %xmm7
1812	palignr	$14, %xmm4, %xmm5
1813	palignr	$14, %xmm3, %xmm4
1814	movaps	%xmm5, 48(%edx)
1815	palignr	$14, %xmm2, %xmm3
1816	lea	64(%eax), %eax
1817	palignr	$14, %xmm1, %xmm2
1818	movaps	%xmm4, 32(%edx)
1819	movaps	%xmm3, 16(%edx)
1820	movaps	%xmm7, %xmm1
1821	movaps	%xmm2, (%edx)
1822	lea	64(%edx), %edx
1823	sub	$64, %ecx
1824	ja	L(Shl14LoopStart)
1825
1826L(Shl14LoopLeave):
1827	add	$32, %ecx
1828	jle	L(shl_end_0)
1829
1830	movaps	2(%eax), %xmm2
1831	movaps	18(%eax), %xmm3
1832	palignr	$14, %xmm2, %xmm3
1833	palignr	$14, %xmm1, %xmm2
1834
1835	movaps	%xmm2, (%edx)
1836	movaps	%xmm3, 16(%edx)
1837	lea	32(%edx, %ecx), %edx
1838	lea	32(%eax, %ecx), %eax
1839	POP (%edi)
1840	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1841
1842	CFI_PUSH (%edi)
1843
1844	.p2align 4
1845L(sh_14_no_prefetch):
1846	lea	-32(%ecx), %ecx
1847	lea	-14(%eax), %eax
1848	xor	%edi, %edi
1849
1850	.p2align 4
1851L(sh_14_no_prefetch_loop):
1852	movdqa	16(%eax, %edi), %xmm2
1853	sub	$32, %ecx
1854	movdqa	32(%eax, %edi), %xmm3
1855	movdqa	%xmm3, %xmm4
1856	palignr	$14, %xmm2, %xmm3
1857	palignr	$14, %xmm1, %xmm2
1858	lea	32(%edi), %edi
1859	movdqa	%xmm2, -32(%edx, %edi)
1860	movdqa	%xmm3, -16(%edx, %edi)
1861	jb	L(sh_14_end_no_prefetch_loop)
1862
1863	movdqa	16(%eax, %edi), %xmm2
1864	sub	$32, %ecx
1865	movdqa	32(%eax, %edi), %xmm3
1866	movdqa	%xmm3, %xmm1
1867	palignr	$14, %xmm2, %xmm3
1868	palignr	$14, %xmm4, %xmm2
1869	lea	32(%edi), %edi
1870	movdqa	%xmm2, -32(%edx, %edi)
1871	movdqa	%xmm3, -16(%edx, %edi)
1872	jae	L(sh_14_no_prefetch_loop)
1873
1874L(sh_14_end_no_prefetch_loop):
1875	lea	32(%ecx), %ecx
1876	add	%ecx, %edi
1877	add	%edi, %edx
1878	lea	14(%edi, %eax), %eax
1879	POP	(%edi)
1880	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1881
1882	CFI_PUSH (%edi)
1883
1884	.p2align 4
1885L(shl_15):
1886#ifndef USE_AS_MEMMOVE
1887	movaps	-15(%eax), %xmm1
1888#else
1889	movl	DEST+4(%esp), %edi
1890	movaps	-15(%eax), %xmm1
1891	movdqu	%xmm0, (%edi)
1892#endif
1893#ifdef DATA_CACHE_SIZE_HALF
1894	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1895#else
1896# if (defined SHARED || defined __PIC__)
1897	SETUP_PIC_REG(bx)
1898	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1899	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1900# else
1901	cmp	__x86_data_cache_size_half, %ecx
1902# endif
1903#endif
1904	jb L(sh_15_no_prefetch)
1905
1906	lea	-64(%ecx), %ecx
1907
1908	.p2align 4
1909L(Shl15LoopStart):
1910	prefetcht0 0x1c0(%eax)
1911	prefetcht0 0x1c0(%edx)
1912	movaps	1(%eax), %xmm2
1913	movaps	17(%eax), %xmm3
1914	movaps	33(%eax), %xmm4
1915	movaps	49(%eax), %xmm5
1916	movaps	%xmm5, %xmm7
1917	palignr	$15, %xmm4, %xmm5
1918	palignr	$15, %xmm3, %xmm4
1919	movaps	%xmm5, 48(%edx)
1920	palignr	$15, %xmm2, %xmm3
1921	lea	64(%eax), %eax
1922	palignr	$15, %xmm1, %xmm2
1923	movaps	%xmm4, 32(%edx)
1924	movaps	%xmm3, 16(%edx)
1925	movaps	%xmm7, %xmm1
1926	movaps	%xmm2, (%edx)
1927	lea	64(%edx), %edx
1928	sub	$64, %ecx
1929	ja	L(Shl15LoopStart)
1930
1931L(Shl15LoopLeave):
1932	add	$32, %ecx
1933	jle	L(shl_end_0)
1934
1935	movaps	1(%eax), %xmm2
1936	movaps	17(%eax), %xmm3
1937	palignr	$15, %xmm2, %xmm3
1938	palignr	$15, %xmm1, %xmm2
1939
1940	movaps	%xmm2, (%edx)
1941	movaps	%xmm3, 16(%edx)
1942	lea	32(%edx, %ecx), %edx
1943	lea	32(%eax, %ecx), %eax
1944	POP (%edi)
1945	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1946
1947	CFI_PUSH (%edi)
1948
1949	.p2align 4
1950L(sh_15_no_prefetch):
1951	lea	-32(%ecx), %ecx
1952	lea	-15(%eax), %eax
1953	xor	%edi, %edi
1954
1955	.p2align 4
1956L(sh_15_no_prefetch_loop):
1957	movdqa	16(%eax, %edi), %xmm2
1958	sub	$32, %ecx
1959	movdqa	32(%eax, %edi), %xmm3
1960	movdqa	%xmm3, %xmm4
1961	palignr	$15, %xmm2, %xmm3
1962	palignr	$15, %xmm1, %xmm2
1963	lea	32(%edi), %edi
1964	movdqa	%xmm2, -32(%edx, %edi)
1965	movdqa	%xmm3, -16(%edx, %edi)
1966	jb	L(sh_15_end_no_prefetch_loop)
1967
1968	movdqa	16(%eax, %edi), %xmm2
1969	sub	$32, %ecx
1970	movdqa	32(%eax, %edi), %xmm3
1971	movdqa	%xmm3, %xmm1
1972	palignr	$15, %xmm2, %xmm3
1973	palignr	$15, %xmm4, %xmm2
1974	lea	32(%edi), %edi
1975	movdqa	%xmm2, -32(%edx, %edi)
1976	movdqa	%xmm3, -16(%edx, %edi)
1977	jae	L(sh_15_no_prefetch_loop)
1978
1979L(sh_15_end_no_prefetch_loop):
1980	lea	32(%ecx), %ecx
1981	add	%ecx, %edi
1982	add	%edi, %edx
1983	lea	15(%edi, %eax), %eax
1984	POP	(%edi)
1985	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1986
1987	CFI_PUSH (%edi)
1988
1989	.p2align 4
1990L(shl_end_0):
1991	lea	32(%ecx), %ecx
1992	lea	(%edx, %ecx), %edx
1993	lea	(%eax, %ecx), %eax
1994	POP	(%edi)
1995	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1996
1997	.p2align 4
1998L(fwd_write_44bytes):
1999	movq	-44(%eax), %xmm0
2000	movq	%xmm0, -44(%edx)
2001L(fwd_write_36bytes):
2002	movq	-36(%eax), %xmm0
2003	movq	%xmm0, -36(%edx)
2004L(fwd_write_28bytes):
2005	movq	-28(%eax), %xmm0
2006	movq	%xmm0, -28(%edx)
2007L(fwd_write_20bytes):
2008	movq	-20(%eax), %xmm0
2009	movq	%xmm0, -20(%edx)
2010L(fwd_write_12bytes):
2011	movq	-12(%eax), %xmm0
2012	movq	%xmm0, -12(%edx)
2013L(fwd_write_4bytes):
2014	movl	-4(%eax), %ecx
2015	movl	%ecx, -4(%edx)
2016#ifdef USE_AS_MEMPCPY
2017	movl	%edx, %eax
2018#else
2019	movl	DEST(%esp), %eax
2020#endif
2021	RETURN
2022
2023	.p2align 4
2024L(fwd_write_40bytes):
2025	movq	-40(%eax), %xmm0
2026	movq	%xmm0, -40(%edx)
2027L(fwd_write_32bytes):
2028	movq	-32(%eax), %xmm0
2029	movq	%xmm0, -32(%edx)
2030L(fwd_write_24bytes):
2031	movq	-24(%eax), %xmm0
2032	movq	%xmm0, -24(%edx)
2033L(fwd_write_16bytes):
2034	movq	-16(%eax), %xmm0
2035	movq	%xmm0, -16(%edx)
2036L(fwd_write_8bytes):
2037	movq	-8(%eax), %xmm0
2038	movq	%xmm0, -8(%edx)
2039L(fwd_write_0bytes):
2040#ifdef USE_AS_MEMPCPY
2041	movl	%edx, %eax
2042#else
2043	movl	DEST(%esp), %eax
2044#endif
2045	RETURN
2046
2047	.p2align 4
2048L(fwd_write_5bytes):
2049	movl	-5(%eax), %ecx
2050	movl	-4(%eax), %eax
2051	movl	%ecx, -5(%edx)
2052	movl	%eax, -4(%edx)
2053#ifdef USE_AS_MEMPCPY
2054	movl	%edx, %eax
2055#else
2056	movl	DEST(%esp), %eax
2057#endif
2058	RETURN
2059
2060	.p2align 4
2061L(fwd_write_45bytes):
2062	movq	-45(%eax), %xmm0
2063	movq	%xmm0, -45(%edx)
2064L(fwd_write_37bytes):
2065	movq	-37(%eax), %xmm0
2066	movq	%xmm0, -37(%edx)
2067L(fwd_write_29bytes):
2068	movq	-29(%eax), %xmm0
2069	movq	%xmm0, -29(%edx)
2070L(fwd_write_21bytes):
2071	movq	-21(%eax), %xmm0
2072	movq	%xmm0, -21(%edx)
2073L(fwd_write_13bytes):
2074	movq	-13(%eax), %xmm0
2075	movq	%xmm0, -13(%edx)
2076	movl	-5(%eax), %ecx
2077	movl	%ecx, -5(%edx)
2078	movzbl	-1(%eax), %ecx
2079	movb	%cl, -1(%edx)
2080#ifdef USE_AS_MEMPCPY
2081	movl	%edx, %eax
2082#else
2083	movl	DEST(%esp), %eax
2084#endif
2085	RETURN
2086
2087	.p2align 4
2088L(fwd_write_41bytes):
2089	movq	-41(%eax), %xmm0
2090	movq	%xmm0, -41(%edx)
2091L(fwd_write_33bytes):
2092	movq	-33(%eax), %xmm0
2093	movq	%xmm0, -33(%edx)
2094L(fwd_write_25bytes):
2095	movq	-25(%eax), %xmm0
2096	movq	%xmm0, -25(%edx)
2097L(fwd_write_17bytes):
2098	movq	-17(%eax), %xmm0
2099	movq	%xmm0, -17(%edx)
2100L(fwd_write_9bytes):
2101	movq	-9(%eax), %xmm0
2102	movq	%xmm0, -9(%edx)
2103L(fwd_write_1bytes):
2104	movzbl	-1(%eax), %ecx
2105	movb	%cl, -1(%edx)
2106#ifdef USE_AS_MEMPCPY
2107	movl	%edx, %eax
2108#else
2109	movl	DEST(%esp), %eax
2110#endif
2111	RETURN
2112
2113	.p2align 4
2114L(fwd_write_46bytes):
2115	movq	-46(%eax), %xmm0
2116	movq	%xmm0, -46(%edx)
2117L(fwd_write_38bytes):
2118	movq	-38(%eax), %xmm0
2119	movq	%xmm0, -38(%edx)
2120L(fwd_write_30bytes):
2121	movq	-30(%eax), %xmm0
2122	movq	%xmm0, -30(%edx)
2123L(fwd_write_22bytes):
2124	movq	-22(%eax), %xmm0
2125	movq	%xmm0, -22(%edx)
2126L(fwd_write_14bytes):
2127	movq	-14(%eax), %xmm0
2128	movq	%xmm0, -14(%edx)
2129L(fwd_write_6bytes):
2130	movl	-6(%eax), %ecx
2131	movl	%ecx, -6(%edx)
2132	movzwl	-2(%eax), %ecx
2133	movw	%cx, -2(%edx)
2134#ifdef USE_AS_MEMPCPY
2135	movl	%edx, %eax
2136#else
2137	movl	DEST(%esp), %eax
2138#endif
2139	RETURN
2140
2141	.p2align 4
2142L(fwd_write_42bytes):
2143	movq	-42(%eax), %xmm0
2144	movq	%xmm0, -42(%edx)
2145L(fwd_write_34bytes):
2146	movq	-34(%eax), %xmm0
2147	movq	%xmm0, -34(%edx)
2148L(fwd_write_26bytes):
2149	movq	-26(%eax), %xmm0
2150	movq	%xmm0, -26(%edx)
2151L(fwd_write_18bytes):
2152	movq	-18(%eax), %xmm0
2153	movq	%xmm0, -18(%edx)
2154L(fwd_write_10bytes):
2155	movq	-10(%eax), %xmm0
2156	movq	%xmm0, -10(%edx)
2157L(fwd_write_2bytes):
2158	movzwl	-2(%eax), %ecx
2159	movw	%cx, -2(%edx)
2160#ifdef USE_AS_MEMPCPY
2161	movl	%edx, %eax
2162#else
2163	movl	DEST(%esp), %eax
2164#endif
2165	RETURN
2166
2167	.p2align 4
2168L(fwd_write_47bytes):
2169	movq	-47(%eax), %xmm0
2170	movq	%xmm0, -47(%edx)
2171L(fwd_write_39bytes):
2172	movq	-39(%eax), %xmm0
2173	movq	%xmm0, -39(%edx)
2174L(fwd_write_31bytes):
2175	movq	-31(%eax), %xmm0
2176	movq	%xmm0, -31(%edx)
2177L(fwd_write_23bytes):
2178	movq	-23(%eax), %xmm0
2179	movq	%xmm0, -23(%edx)
2180L(fwd_write_15bytes):
2181	movq	-15(%eax), %xmm0
2182	movq	%xmm0, -15(%edx)
2183L(fwd_write_7bytes):
2184	movl	-7(%eax), %ecx
2185	movl	%ecx, -7(%edx)
2186	movzwl	-3(%eax), %ecx
2187	movzbl	-1(%eax), %eax
2188	movw	%cx, -3(%edx)
2189	movb	%al, -1(%edx)
2190#ifdef USE_AS_MEMPCPY
2191	movl	%edx, %eax
2192#else
2193	movl	DEST(%esp), %eax
2194#endif
2195	RETURN
2196
2197	.p2align 4
2198L(fwd_write_43bytes):
2199	movq	-43(%eax), %xmm0
2200	movq	%xmm0, -43(%edx)
2201L(fwd_write_35bytes):
2202	movq	-35(%eax), %xmm0
2203	movq	%xmm0, -35(%edx)
2204L(fwd_write_27bytes):
2205	movq	-27(%eax), %xmm0
2206	movq	%xmm0, -27(%edx)
2207L(fwd_write_19bytes):
2208	movq	-19(%eax), %xmm0
2209	movq	%xmm0, -19(%edx)
2210L(fwd_write_11bytes):
2211	movq	-11(%eax), %xmm0
2212	movq	%xmm0, -11(%edx)
2213L(fwd_write_3bytes):
2214	movzwl	-3(%eax), %ecx
2215	movzbl	-1(%eax), %eax
2216	movw	%cx, -3(%edx)
2217	movb	%al, -1(%edx)
2218#ifdef USE_AS_MEMPCPY
2219	movl	%edx, %eax
2220#else
2221	movl	DEST(%esp), %eax
2222#endif
2223	RETURN
2224
2225	.p2align 4
2226L(fwd_write_40bytes_align):
2227	movdqa	-40(%eax), %xmm0
2228	movdqa	%xmm0, -40(%edx)
2229L(fwd_write_24bytes_align):
2230	movdqa	-24(%eax), %xmm0
2231	movdqa	%xmm0, -24(%edx)
2232L(fwd_write_8bytes_align):
2233	movq	-8(%eax), %xmm0
2234	movq	%xmm0, -8(%edx)
2235L(fwd_write_0bytes_align):
2236#ifdef USE_AS_MEMPCPY
2237	movl	%edx, %eax
2238#else
2239	movl	DEST(%esp), %eax
2240#endif
2241	RETURN
2242
2243	.p2align 4
2244L(fwd_write_32bytes_align):
2245	movdqa	-32(%eax), %xmm0
2246	movdqa	%xmm0, -32(%edx)
2247L(fwd_write_16bytes_align):
2248	movdqa	-16(%eax), %xmm0
2249	movdqa	%xmm0, -16(%edx)
2250#ifdef USE_AS_MEMPCPY
2251	movl	%edx, %eax
2252#else
2253	movl	DEST(%esp), %eax
2254#endif
2255	RETURN
2256
2257	.p2align 4
2258L(fwd_write_5bytes_align):
2259	movl	-5(%eax), %ecx
2260	movl	-4(%eax), %eax
2261	movl	%ecx, -5(%edx)
2262	movl	%eax, -4(%edx)
2263#ifdef USE_AS_MEMPCPY
2264	movl	%edx, %eax
2265#else
2266	movl	DEST(%esp), %eax
2267#endif
2268	RETURN
2269
2270	.p2align 4
2271L(fwd_write_45bytes_align):
2272	movdqa	-45(%eax), %xmm0
2273	movdqa	%xmm0, -45(%edx)
2274L(fwd_write_29bytes_align):
2275	movdqa	-29(%eax), %xmm0
2276	movdqa	%xmm0, -29(%edx)
2277L(fwd_write_13bytes_align):
2278	movq	-13(%eax), %xmm0
2279	movq	%xmm0, -13(%edx)
2280	movl	-5(%eax), %ecx
2281	movl	%ecx, -5(%edx)
2282	movzbl	-1(%eax), %ecx
2283	movb	%cl, -1(%edx)
2284#ifdef USE_AS_MEMPCPY
2285	movl	%edx, %eax
2286#else
2287	movl	DEST(%esp), %eax
2288#endif
2289	RETURN
2290
2291	.p2align 4
2292L(fwd_write_37bytes_align):
2293	movdqa	-37(%eax), %xmm0
2294	movdqa	%xmm0, -37(%edx)
2295L(fwd_write_21bytes_align):
2296	movdqa	-21(%eax), %xmm0
2297	movdqa	%xmm0, -21(%edx)
2298	movl	-5(%eax), %ecx
2299	movl	%ecx, -5(%edx)
2300	movzbl	-1(%eax), %ecx
2301	movb	%cl, -1(%edx)
2302#ifdef USE_AS_MEMPCPY
2303	movl	%edx, %eax
2304#else
2305	movl	DEST(%esp), %eax
2306#endif
2307	RETURN
2308
2309	.p2align 4
2310L(fwd_write_41bytes_align):
2311	movdqa	-41(%eax), %xmm0
2312	movdqa	%xmm0, -41(%edx)
2313L(fwd_write_25bytes_align):
2314	movdqa	-25(%eax), %xmm0
2315	movdqa	%xmm0, -25(%edx)
2316L(fwd_write_9bytes_align):
2317	movq	-9(%eax), %xmm0
2318	movq	%xmm0, -9(%edx)
2319L(fwd_write_1bytes_align):
2320	movzbl	-1(%eax), %ecx
2321	movb	%cl, -1(%edx)
2322#ifdef USE_AS_MEMPCPY
2323	movl	%edx, %eax
2324#else
2325	movl	DEST(%esp), %eax
2326#endif
2327	RETURN
2328
2329	.p2align 4
2330L(fwd_write_33bytes_align):
2331	movdqa	-33(%eax), %xmm0
2332	movdqa	%xmm0, -33(%edx)
2333L(fwd_write_17bytes_align):
2334	movdqa	-17(%eax), %xmm0
2335	movdqa	%xmm0, -17(%edx)
2336	movzbl	-1(%eax), %ecx
2337	movb	%cl, -1(%edx)
2338#ifdef USE_AS_MEMPCPY
2339	movl	%edx, %eax
2340#else
2341	movl	DEST(%esp), %eax
2342#endif
2343	RETURN
2344
	.p2align 4
L(fwd_write_46bytes_align):
	movdqa	-46(%eax), %xmm0
	movdqa	%xmm0, -46(%edx)
L(fwd_write_30bytes_align):
	movdqa	-30(%eax), %xmm0
	movdqa	%xmm0, -30(%edx)
L(fwd_write_14bytes_align):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes_align):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
L(fwd_write_38bytes_align):
	movdqa	-38(%eax), %xmm0
	movdqa	%xmm0, -38(%edx)
L(fwd_write_22bytes_align):
	movdqa	-22(%eax), %xmm0
	movdqa	%xmm0, -22(%edx)
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
L(fwd_write_42bytes_align):
	movdqa	-42(%eax), %xmm0
	movdqa	%xmm0, -42(%edx)
L(fwd_write_26bytes_align):
	movdqa	-26(%eax), %xmm0
	movdqa	%xmm0, -26(%edx)
L(fwd_write_10bytes_align):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes_align):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
L(fwd_write_34bytes_align):
	movdqa	-34(%eax), %xmm0
	movdqa	%xmm0, -34(%edx)
L(fwd_write_18bytes_align):
	movdqa	-18(%eax), %xmm0
	movdqa	%xmm0, -18(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
L(fwd_write_47bytes_align):
	movdqa	-47(%eax), %xmm0
	movdqa	%xmm0, -47(%edx)
L(fwd_write_31bytes_align):
	movdqa	-31(%eax), %xmm0
	movdqa	%xmm0, -31(%edx)
L(fwd_write_15bytes_align):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes_align):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
L(fwd_write_39bytes_align):
	movdqa	-39(%eax), %xmm0
	movdqa	%xmm0, -39(%edx)
L(fwd_write_23bytes_align):
	movdqa	-23(%eax), %xmm0
	movdqa	%xmm0, -23(%edx)
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
L(fwd_write_43bytes_align):
	movdqa	-43(%eax), %xmm0
	movdqa	%xmm0, -43(%edx)
L(fwd_write_27bytes_align):
	movdqa	-27(%eax), %xmm0
	movdqa	%xmm0, -27(%edx)
L(fwd_write_11bytes_align):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes_align):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
L(fwd_write_35bytes_align):
	movdqa	-35(%eax), %xmm0
	movdqa	%xmm0, -35(%edx)
L(fwd_write_19bytes_align):
	movdqa	-19(%eax), %xmm0
	movdqa	%xmm0, -19(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
L(fwd_write_44bytes_align):
	movdqa	-44(%eax), %xmm0
	movdqa	%xmm0, -44(%edx)
L(fwd_write_28bytes_align):
	movdqa	-28(%eax), %xmm0
	movdqa	%xmm0, -28(%edx)
L(fwd_write_12bytes_align):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes_align):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN

	.p2align 4
L(fwd_write_36bytes_align):
	movdqa	-36(%eax), %xmm0
	movdqa	%xmm0, -36(%edx)
L(fwd_write_20bytes_align):
	movdqa	-20(%eax), %xmm0
	movdqa	%xmm0, -20(%edx)
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN_END

	CFI_PUSH (%edi)

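/* Large copies: stream the data with non-temporal stores (movntdq) to avoid
   polluting the cache; the sfence further down orders them before the
   remaining tail is copied through the forward jump table.  For memmove,
   XMM0 (saved earlier with the first 16 source bytes) is replayed to the
   original destination before those bytes can be clobbered; DEST+4 accounts
   for the extra stack slot from the earlier push of EDI.  */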
	.p2align 4
L(large_page):
	movdqu	(%eax), %xmm1
#ifdef USE_AS_MEMMOVE
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	lea	16(%eax), %eax
	movntdq	%xmm1, (%edx)
	lea	16(%edx), %edx
	lea	-0x90(%ecx), %ecx
	POP (%edi)

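/* Unrolled 128-byte streaming loop.  ECX carries a -0x80 bias from the lea
   above, so the SUB below leaves CF clear exactly while another full
   128-byte block remains; the LEA after the loop restores the true count
   without disturbing the flags set by the CMP.  */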
	.p2align 4
L(large_page_loop):
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	movdqu	0x40(%eax), %xmm4
	movdqu	0x50(%eax), %xmm5
	movdqu	0x60(%eax), %xmm6
	movdqu	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax

	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	movntdq	%xmm4, 0x40(%edx)
	movntdq	%xmm5, 0x50(%edx)
	movntdq	%xmm6, 0x60(%edx)
	movntdq	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx
	jae	L(large_page_loop)
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(large_page_less_64bytes)

	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	lea	0x40(%eax), %eax

	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	lea	0x40(%edx), %edx
	sub	$0x40, %ecx
L(large_page_less_64bytes):
	cmp	$32, %ecx
	jb	L(large_page_less_32bytes)
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	lea	0x20(%eax), %eax
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	lea	0x20(%edx), %edx
	sub	$0x20, %ecx
L(large_page_less_32bytes):
	add	%ecx, %edx
	add	%ecx, %eax
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

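/* Backward-copy tails: the entries of L(table_48_bytes_bwd).  Here EAX and
   EDX point at the start of the remaining block, so the offsets are
   positive, unlike the forward tails above.  mempcpy still returns
   dst + len, rebuilt from the saved DEST and LEN arguments.  */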
	.p2align 4
L(bk_write_44bytes):
	movq	36(%eax), %xmm0
	movq	%xmm0, 36(%edx)
L(bk_write_36bytes):
	movq	28(%eax), %xmm0
	movq	%xmm0, 28(%edx)
L(bk_write_28bytes):
	movq	20(%eax), %xmm0
	movq	%xmm0, 20(%edx)
L(bk_write_20bytes):
	movq	12(%eax), %xmm0
	movq	%xmm0, 12(%edx)
L(bk_write_12bytes):
	movq	4(%eax), %xmm0
	movq	%xmm0, 4(%edx)
L(bk_write_4bytes):
	movl	(%eax), %ecx
	movl	%ecx, (%edx)
L(bk_write_0bytes):
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_40bytes):
	movq	32(%eax), %xmm0
	movq	%xmm0, 32(%edx)
L(bk_write_32bytes):
	movq	24(%eax), %xmm0
	movq	%xmm0, 24(%edx)
L(bk_write_24bytes):
	movq	16(%eax), %xmm0
	movq	%xmm0, 16(%edx)
L(bk_write_16bytes):
	movq	8(%eax), %xmm0
	movq	%xmm0, 8(%edx)
L(bk_write_8bytes):
	movq	(%eax), %xmm0
	movq	%xmm0, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_45bytes):
	movq	37(%eax), %xmm0
	movq	%xmm0, 37(%edx)
L(bk_write_37bytes):
	movq	29(%eax), %xmm0
	movq	%xmm0, 29(%edx)
L(bk_write_29bytes):
	movq	21(%eax), %xmm0
	movq	%xmm0, 21(%edx)
L(bk_write_21bytes):
	movq	13(%eax), %xmm0
	movq	%xmm0, 13(%edx)
L(bk_write_13bytes):
	movq	5(%eax), %xmm0
	movq	%xmm0, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_41bytes):
	movq	33(%eax), %xmm0
	movq	%xmm0, 33(%edx)
L(bk_write_33bytes):
	movq	25(%eax), %xmm0
	movq	%xmm0, 25(%edx)
L(bk_write_25bytes):
	movq	17(%eax), %xmm0
	movq	%xmm0, 17(%edx)
L(bk_write_17bytes):
	movq	9(%eax), %xmm0
	movq	%xmm0, 9(%edx)
L(bk_write_9bytes):
	movq	1(%eax), %xmm0
	movq	%xmm0, 1(%edx)
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_46bytes):
	movq	38(%eax), %xmm0
	movq	%xmm0, 38(%edx)
L(bk_write_38bytes):
	movq	30(%eax), %xmm0
	movq	%xmm0, 30(%edx)
L(bk_write_30bytes):
	movq	22(%eax), %xmm0
	movq	%xmm0, 22(%edx)
L(bk_write_22bytes):
	movq	14(%eax), %xmm0
	movq	%xmm0, 14(%edx)
L(bk_write_14bytes):
	movq	6(%eax), %xmm0
	movq	%xmm0, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN_END


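/* Jump tables for the tail copies.  Each table holds one 4-byte entry per
   residual length 0..47, emitted via JMPTBL so that PIC builds store
   table-relative offsets instead of absolute addresses.  */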
	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	.p2align 2
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

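/* Dispatch table for the shifted-copy paths L(shl_0)..L(shl_15), presumably
   indexed by the low four bits of the source/destination misalignment and
   implemented with palignr earlier in this file.  */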
	.p2align 2
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

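/* Backward-copy dispatch table; same layout as L(table_48bytes_fwd).  */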
	.p2align 2
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection

#ifdef USE_AS_MEMMOVE
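/* memmove only: the destination overlaps the source with dst > src, so copy
   from the end of the block toward the beginning.  EDI and EDX are first
   advanced to one past the end of the source and destination.  */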
	.p2align 4
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	lea	(%ecx,%edx,1),%edx
	lea	(%ecx,%edi,1),%edi
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	CFI_PUSH (%edi)

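/* The destination end is not 4-byte aligned: copy one trailing byte and/or
   one trailing word so that EDX becomes 4-byte aligned, then rejoin
   L(bk_aligned_4).  Blocks of at most 8 bytes go straight to the tail
   tables instead.  */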
	.p2align 4
L(bk_align):
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

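/* At least 64 bytes remain and EDX is 4-byte aligned: peel off 4 bytes at a
   time (at most three times) until the destination end is 16-byte aligned,
   then fall into the 64-byte backward loop.  */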
	.p2align 4
L(bk_write_more64bytes):
	/* Check the alignment of the destination end.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is 4-byte aligned, but not 16-byte aligned.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

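/* Main backward loop: 64 bytes per iteration, unaligned loads from the
   source and aligned stores to the now 16-byte-aligned destination.  */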
	.p2align 4
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif

END (MEMCPY)
