1/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#include <private/bionic_asm.h>
32
33#define FOR_ATOM
34#include "cache.h"
35
/* L(name): spell an assembler-local label (".L" prefix) so that the
   internal branch targets of this file are not exported as symbols.  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n): align the location counter to a 2^n byte boundary.  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

/* Unwind (CFI) bookkeeping for a 4-byte push of REG: the CFA offset grows
   by 4 and REG is recorded as saved at offset 0 from the CFA register.  */
#define CFI_PUSH(REG)						\
  .cfi_adjust_cfa_offset 4;					\
  .cfi_rel_offset REG, 0

/* Unwind (CFI) bookkeeping for the matching 4-byte pop of REG.  */
#define CFI_POP(REG)						\
  .cfi_adjust_cfa_offset -4;					\
  .cfi_restore REG

/* Push/pop a register and keep the unwind information in sync.  */
#define PUSH(REG)	pushl REG; CFI_PUSH(REG)
#define POP(REG)	popl REG; CFI_POP(REG)

/* Offsets of the stack arguments from ESP once ENTRANCE has pushed EBX:
   4 bytes of return address plus 4 bytes of saved EBX give PARMS = 8.
   CHK_DST_LEN is only read by __memset_chk_atom.  */
#define PARMS 8  /* Preserve EBX. */
#define DST PARMS
#define CHR (DST+4)
#define LEN (CHR+4)
#define CHK_DST_LEN (LEN+4)
/* Load the return value: EAX = DST as passed by the caller.  Only valid
   while exactly one extra word (the saved EBX) is on the stack.  */
#define SETRTNVAL	movl DST(%esp), %eax

/* ENTRANCE saves EBX, which BRANCH_TO_JMPTBL_ENTRY uses as scratch.
   RETURN_END restores it and returns.  RETURN does the same and then
   re-declares the push for the unwinder, because the code that follows a
   RETURN is reached by other paths that still have EBX on the stack.  */
#define ENTRANCE	PUSH(%ebx);
#define RETURN_END	POP(%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH(%ebx)
/* Jump-table entry: offset of label I relative to the table base B.  */
#define JMPTBL(I, B)	I - B
66
/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.
   In:   ECX = table index (the number of bytes still to be written),
         EDX = address at which those bytes start.
   Out:  EDX += ECX, so the branch target stores at negative offsets
         from EDX.
   Clobbers EBX and the flags.  The PC is obtained through
   __x86.get_pc_thunk.bx and the table holds offsets relative to itself,
   so no absolute address is needed.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    /* We first load PC into EBX.  */				\
    call	__x86.get_pc_thunk.bx;				\
    /* Get the address of the jump table.  */			\
    add		$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    add		(%ebx,%ecx,4), %ebx;				\
    add		%ecx, %edx;					\
    /* We loaded the jump table and adjusted EDX. Go.  */	\
    jmp		*%ebx
80
/* Checked entry point.  Reads LEN and a fourth stack argument,
   CHK_DST_LEN (presumably the size of the destination object -- confirm
   against the caller).  When LEN <= CHK_DST_LEN (unsigned) control joins
   memset_atom at L(memset_length_loaded) with ECX = LEN and EBX already
   pushed; otherwise the push is undone and __memset_chk_fail takes over.  */
ENTRY(__memset_chk_atom)
	ENTRANCE

	movl	LEN(%esp), %ecx			/* ECX = requested length.  */
	cmpl	CHK_DST_LEN(%esp), %ecx
	jbe	L(memset_length_loaded)		/* Fits: do the fill.  */

	/* Too long: drop the EBX saved by ENTRANCE so the stack is exactly as
	   it was on entry, then jump (not call) to the failure handler.  */
	POP(%ebx)
	jmp	__memset_chk_fail
END(__memset_chk_atom)
91
	.section .text.sse2,"ax",@progbits
	ALIGN(4)
/* memset_atom: fill LEN bytes starting at DST with the low byte of CHR
   and return DST in EAX.
   Arguments are read from the stack at DST/CHR/LEN(%esp) after ENTRANCE
   has pushed EBX.
   Register roles in the code below:
     ECX  = number of bytes still to write
     EAX  = fill byte replicated into all four bytes
     EDX  = destination cursor
     XMM0 = fill byte replicated into all sixteen bytes (32+ byte path)
     EBX  = jump-table scratch (saved by ENTRANCE, restored on return).  */
ENTRY(memset_atom)
	ENTRANCE

	movl	LEN(%esp), %ecx
/* __memset_chk_atom joins here with ECX = LEN and EBX already pushed.  */
L(memset_length_loaded):
	movzbl	CHR(%esp), %eax
	movb	%al, %ah
	/* Fill the whole EAX with pattern.  */
	movl	%eax, %edx
	shl	$16, %eax
	or	%edx, %eax
	movl	DST(%esp), %edx
	/* Lengths below 32 are handled with scalar stores only.  */
	cmp	$32, %ecx
	jae	L(32bytesormore)

/* ECX < 32: jump straight to the store sequence for exactly ECX bytes.  */
L(write_less32bytes):
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_32bytes))
111
112
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN(2)
/* Jump table for fills of 0..31 bytes.  Entry N is the offset, relative to
   the table itself, of L(write_Nbytes); it is indexed by ECX in
   BRANCH_TO_JMPTBL_ENTRY.  */
L(table_less_32bytes):
	.int	JMPTBL(L(write_0bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_1bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_2bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_3bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_4bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_5bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_6bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_7bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_8bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_9bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_10bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_11bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_12bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_13bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_14bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_15bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_16bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_17bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_18bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_19bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_20bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_21bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_22bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_23bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_24bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_25bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_26bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_27bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_28bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_29bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_30bytes), L(table_less_32bytes))
	.int	JMPTBL(L(write_31bytes), L(table_less_32bytes))
	.popsection
149
	ALIGN(4)
/* Scalar store sequences for 0..31 bytes.  They are entered through
   L(table_less_32bytes) with EDX pointing one past the last byte to fill
   (BRANCH_TO_JMPTBL_ENTRY has added ECX to it) and EAX holding the
   pattern.  Each chain is entered at the label for N bytes and falls
   through, writing 4 bytes per step; the four chains differ only in how
   the N mod 4 leftover bytes are written.  Every chain ends by loading the
   return value and returning.  */
/* N mod 4 == 0.  */
L(write_28bytes):
	movl	%eax, -28(%edx)
L(write_24bytes):
	movl	%eax, -24(%edx)
L(write_20bytes):
	movl	%eax, -20(%edx)
L(write_16bytes):
	movl	%eax, -16(%edx)
L(write_12bytes):
	movl	%eax, -12(%edx)
L(write_8bytes):
	movl	%eax, -8(%edx)
L(write_4bytes):
	movl	%eax, -4(%edx)
L(write_0bytes):
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 4 == 1: the last byte is written on its own.  */
L(write_29bytes):
	movl	%eax, -29(%edx)
L(write_25bytes):
	movl	%eax, -25(%edx)
L(write_21bytes):
	movl	%eax, -21(%edx)
L(write_17bytes):
	movl	%eax, -17(%edx)
L(write_13bytes):
	movl	%eax, -13(%edx)
L(write_9bytes):
	movl	%eax, -9(%edx)
L(write_5bytes):
	movl	%eax, -5(%edx)
L(write_1bytes):
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 4 == 2: the last two bytes are written as one 16-bit store.  */
L(write_30bytes):
	movl	%eax, -30(%edx)
L(write_26bytes):
	movl	%eax, -26(%edx)
L(write_22bytes):
	movl	%eax, -22(%edx)
L(write_18bytes):
	movl	%eax, -18(%edx)
L(write_14bytes):
	movl	%eax, -14(%edx)
L(write_10bytes):
	movl	%eax, -10(%edx)
L(write_6bytes):
	movl	%eax, -6(%edx)
L(write_2bytes):
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 4 == 3: a 16-bit store followed by a single byte.  */
L(write_31bytes):
	movl	%eax, -31(%edx)
L(write_27bytes):
	movl	%eax, -27(%edx)
L(write_23bytes):
	movl	%eax, -23(%edx)
L(write_19bytes):
	movl	%eax, -19(%edx)
L(write_15bytes):
	movl	%eax, -15(%edx)
L(write_11bytes):
	movl	%eax, -11(%edx)
L(write_7bytes):
	movl	%eax, -7(%edx)
L(write_3bytes):
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN
229
	ALIGN(4)
/* ECX >= 32.  EDX is the DST passed by the caller; nothing has been
   established about its alignment yet.  */
L(32bytesormore):
	/* Fill xmm0 with the pattern.  */
	movd	%eax, %xmm0
	pshufd	$0, %xmm0, %xmm0
	testl	$0xf, %edx
	jz	L(aligned_16)
/* ECX >= 32 and EDX is not 16 byte aligned.  */
L(not_aligned_16):
	/* Cover the head with one unaligned 16-byte store, then advance EDX to
	   the next 16-byte boundary.  The bytes between that boundary and the
	   end of the unaligned store are simply written twice.  */
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	and	$-16, %edx
	add	$16, %edx
	/* EAX = old EDX - new EDX, i.e. minus the number of bytes skipped;
	   adding it to ECX takes those bytes off the remaining length.  */
	sub	%edx, %eax
	add	%eax, %ecx
	/* EAX was used as scratch: reload the 4-byte pattern from XMM0.  */
	movd	%xmm0, %eax

	ALIGN(4)
/* EDX is 16 byte aligned from here on.  */
L(aligned_16):
	cmp	$128, %ecx
	jae	L(128bytesormore)

/* ECX < 128: finish through the aligned tail table.  */
L(aligned_16_less128bytes):
	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
255
	ALIGN(4)
/* ECX >= 128 and EDX is 16 byte aligned.  Choose a store strategy from the
   length: at least SHARED_CACHE_SIZE goes to the non-temporal path, at
   least DATA_CACHE_SIZE to the prefetching loop, everything else to the
   plain unrolled loop below.  */
L(128bytesormore):
	/* EBX is used as scratch for the cache size, so it is pushed a second
	   time; the non-temporal path is entered with this push still in
	   place.  */
	PUSH(%ebx)
	mov	$SHARED_CACHE_SIZE, %ebx
	cmp	%ebx, %ecx
	jae	L(128bytesormore_nt_start)


	POP(%ebx)
/* RESTORE_EBX_STATE is emitted in front of L(128bytesormore_nt_start): that
   label is reached by the jump above, before the POP, so the unwind
   information has to describe the extra push again there.  */
# define RESTORE_EBX_STATE CFI_PUSH(%ebx)
	cmp	$DATA_CACHE_SIZE, %ecx

	jae	L(128bytes_L2_normal)
	/* Bias ECX by -128 so that the "sub" in the loop borrows exactly when
	   fewer than 128 bytes will remain after the current 128-byte store.  */
	subl	$128, %ecx
/* Loop unrolled twice, 128 bytes per half.  The stores do not touch the
   flags and "lea" is used to advance EDX for the same reason, so the
   jb/jae after each half still test the result of its "sub".  */
L(128bytesormore_normal):
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jb	L(128bytesless_normal)


	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	/* Undo the bias: ECX is now the true number of bytes left (0..127).  */
	add	$128, %ecx
	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
299
	ALIGN(4)
/* DATA_CACHE_SIZE <= ECX < SHARED_CACHE_SIZE on entry, EDX 16 byte aligned.
   128 bytes are stored per iteration, with prefetches issued 0x380 and
   0x3c0 bytes ahead of the cursor.  ECX is the exact number of bytes left
   (no bias), so the loop continues while at least 128 remain.  */
L(128bytes_L2_normal):
	prefetcht0	0x380(%edx)
	prefetcht0	0x3c0(%edx)
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	add	$128, %edx
	cmp	$128, %ecx
	jae	L(128bytes_L2_normal)

/* ECX < 128: finish through the aligned tail table.  */
L(128bytesless_L2_normal):
	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
319
	RESTORE_EBX_STATE
/* ECX >= SHARED_CACHE_SIZE.  Entered with EBX = SHARED_CACHE_SIZE and the
   second push of EBX from L(128bytesormore) still on the stack.
   The first loop writes SHARED_CACHE_SIZE rounded down to a multiple of
   128 with ordinary stores, counting down in EBX; the second loop writes
   what is left with non-temporal stores while at least 128 bytes remain.  */
L(128bytesormore_nt_start):
	/* ECX = length - SHARED_CACHE_SIZE + (SHARED_CACHE_SIZE mod 128): the
	   bytes that will be left once the first loop has finished.  */
	sub	%ebx, %ecx
	mov	%ebx, %eax
	and	$0x7f, %eax
	add	%eax, %ecx
	/* EAX was used as scratch: reload the 4-byte pattern from XMM0.  */
	movd	%xmm0, %eax
	ALIGN(4)
L(128bytesormore_shared_cache_loop):
	prefetcht0	0x3c0(%edx)
	prefetcht0	0x380(%edx)
	sub	$0x80, %ebx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ebx
	jae	L(128bytesormore_shared_cache_loop)
	/* Skip the non-temporal loop when fewer than 128 bytes are left.  */
	cmp	$0x80, %ecx
	jb	L(shared_cache_loop_end)
	ALIGN(4)
L(128bytesormore_nt):
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm0, 0x10(%edx)
	movntdq	%xmm0, 0x20(%edx)
	movntdq	%xmm0, 0x30(%edx)
	movntdq	%xmm0, 0x40(%edx)
	movntdq	%xmm0, 0x50(%edx)
	movntdq	%xmm0, 0x60(%edx)
	movntdq	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ecx
	jae	L(128bytesormore_nt)
	/* Fence after the run of non-temporal stores.  */
	sfence
L(shared_cache_loop_end):
	/* Balance the PUSH(%ebx) done at L(128bytesormore); the tail code
	   expects only the ENTRANCE push to be on the stack.  */
	POP(%ebx)
	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
363
364
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN(2)
/* Jump table for the 0..127 bytes left over after the 16-byte aligned bulk
   stores.  Entry N is the offset, relative to the table itself, of
   L(aligned_16_Nbytes); it is indexed by ECX in BRANCH_TO_JMPTBL_ENTRY.  */
L(table_16_128bytes):
	.int	JMPTBL(L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_1bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_2bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_3bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_5bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_6bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_7bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_9bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_10bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_11bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_13bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_14bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_15bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_17bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_18bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_19bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_21bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_22bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_23bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_25bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_26bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_27bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_29bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_30bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_31bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_33bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_34bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_35bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_37bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_38bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_39bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_41bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_42bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_43bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_45bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_46bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_47bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_49bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_50bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_51bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_53bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_54bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_55bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_57bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_58bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_59bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_61bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_62bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_63bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_65bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_66bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_67bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_69bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_70bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_71bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_73bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_74bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_75bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_77bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_78bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_79bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_81bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_82bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_83bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_85bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_86bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_87bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_89bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_90bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_91bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_93bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_94bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_95bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_97bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_98bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_99bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_101bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_102bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_103bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_105bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_106bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_107bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_109bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_110bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_111bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_113bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_114bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_115bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_117bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_118bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_119bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_121bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_122bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_123bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_124bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_125bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_126bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_127bytes), L(table_16_128bytes))
	.popsection
497
	ALIGN(4)
/* Store sequences for the last 0..127 bytes, entered through
   L(table_16_128bytes).  EDX points one past the last byte to fill
   (BRANCH_TO_JMPTBL_ENTRY has added ECX to it); EAX and XMM0 hold the
   pattern.  Before that addition EDX was 16 byte aligned on every path
   that reaches the table, so EDX - N is aligned for the N-byte entry and
   the movdqa stores below are legal in spite of their odd offsets.
   Each chain is entered at the label for N bytes and falls through,
   writing 16 bytes per step; the chains differ only in how the N mod 16
   leftover bytes are written.  This part holds N mod 16 == 0..7.  */
/* N mod 16 == 0.  */
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%edx)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%edx)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%edx)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%edx)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%edx)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%edx)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%edx)
L(aligned_16_0bytes):
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 1: leftover = 1 byte.  */
L(aligned_16_113bytes):
	movdqa	%xmm0, -113(%edx)
L(aligned_16_97bytes):
	movdqa	%xmm0, -97(%edx)
L(aligned_16_81bytes):
	movdqa	%xmm0, -81(%edx)
L(aligned_16_65bytes):
	movdqa	%xmm0, -65(%edx)
L(aligned_16_49bytes):
	movdqa	%xmm0, -49(%edx)
L(aligned_16_33bytes):
	movdqa	%xmm0, -33(%edx)
L(aligned_16_17bytes):
	movdqa	%xmm0, -17(%edx)
L(aligned_16_1bytes):
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 2: leftover = 2 bytes.  */
L(aligned_16_114bytes):
	movdqa	%xmm0, -114(%edx)
L(aligned_16_98bytes):
	movdqa	%xmm0, -98(%edx)
L(aligned_16_82bytes):
	movdqa	%xmm0, -82(%edx)
L(aligned_16_66bytes):
	movdqa	%xmm0, -66(%edx)
L(aligned_16_50bytes):
	movdqa	%xmm0, -50(%edx)
L(aligned_16_34bytes):
	movdqa	%xmm0, -34(%edx)
L(aligned_16_18bytes):
	movdqa	%xmm0, -18(%edx)
L(aligned_16_2bytes):
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 3: leftover = 2 + 1 bytes.  */
L(aligned_16_115bytes):
	movdqa	%xmm0, -115(%edx)
L(aligned_16_99bytes):
	movdqa	%xmm0, -99(%edx)
L(aligned_16_83bytes):
	movdqa	%xmm0, -83(%edx)
L(aligned_16_67bytes):
	movdqa	%xmm0, -67(%edx)
L(aligned_16_51bytes):
	movdqa	%xmm0, -51(%edx)
L(aligned_16_35bytes):
	movdqa	%xmm0, -35(%edx)
L(aligned_16_19bytes):
	movdqa	%xmm0, -19(%edx)
L(aligned_16_3bytes):
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 4: leftover = 4 bytes.  */
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%edx)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%edx)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%edx)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%edx)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%edx)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%edx)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%edx)
L(aligned_16_4bytes):
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 5: leftover = 4 + 1 bytes.  */
L(aligned_16_117bytes):
	movdqa	%xmm0, -117(%edx)
L(aligned_16_101bytes):
	movdqa	%xmm0, -101(%edx)
L(aligned_16_85bytes):
	movdqa	%xmm0, -85(%edx)
L(aligned_16_69bytes):
	movdqa	%xmm0, -69(%edx)
L(aligned_16_53bytes):
	movdqa	%xmm0, -53(%edx)
L(aligned_16_37bytes):
	movdqa	%xmm0, -37(%edx)
L(aligned_16_21bytes):
	movdqa	%xmm0, -21(%edx)
L(aligned_16_5bytes):
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 6: leftover = 4 + 2 bytes.  */
L(aligned_16_118bytes):
	movdqa	%xmm0, -118(%edx)
L(aligned_16_102bytes):
	movdqa	%xmm0, -102(%edx)
L(aligned_16_86bytes):
	movdqa	%xmm0, -86(%edx)
L(aligned_16_70bytes):
	movdqa	%xmm0, -70(%edx)
L(aligned_16_54bytes):
	movdqa	%xmm0, -54(%edx)
L(aligned_16_38bytes):
	movdqa	%xmm0, -38(%edx)
L(aligned_16_22bytes):
	movdqa	%xmm0, -22(%edx)
L(aligned_16_6bytes):
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 7: leftover = 4 + 2 + 1 bytes.  */
L(aligned_16_119bytes):
	movdqa	%xmm0, -119(%edx)
L(aligned_16_103bytes):
	movdqa	%xmm0, -103(%edx)
L(aligned_16_87bytes):
	movdqa	%xmm0, -87(%edx)
L(aligned_16_71bytes):
	movdqa	%xmm0, -71(%edx)
L(aligned_16_55bytes):
	movdqa	%xmm0, -55(%edx)
L(aligned_16_39bytes):
	movdqa	%xmm0, -39(%edx)
L(aligned_16_23bytes):
	movdqa	%xmm0, -23(%edx)
L(aligned_16_7bytes):
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN
661
	ALIGN(4)
/* Store sequences entered through L(table_16_128bytes) for the byte counts
   N with N mod 16 == 8..15.  EDX points one past the last byte to fill and
   EDX - N is 16 byte aligned, so the movdqa stores are aligned.  Each
   chain falls through 16 bytes at a time and then writes the leftover,
   starting with an 8-byte movq from XMM0.  */
/* N mod 16 == 8: leftover = 8 bytes.  */
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%edx)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%edx)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%edx)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%edx)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%edx)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%edx)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%edx)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 9: leftover = 8 + 1 bytes.  */
L(aligned_16_121bytes):
	movdqa	%xmm0, -121(%edx)
L(aligned_16_105bytes):
	movdqa	%xmm0, -105(%edx)
L(aligned_16_89bytes):
	movdqa	%xmm0, -89(%edx)
L(aligned_16_73bytes):
	movdqa	%xmm0, -73(%edx)
L(aligned_16_57bytes):
	movdqa	%xmm0, -57(%edx)
L(aligned_16_41bytes):
	movdqa	%xmm0, -41(%edx)
L(aligned_16_25bytes):
	movdqa	%xmm0, -25(%edx)
L(aligned_16_9bytes):
	movq	%xmm0, -9(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 10: leftover = 8 + 2 bytes.  */
L(aligned_16_122bytes):
	movdqa	%xmm0, -122(%edx)
L(aligned_16_106bytes):
	movdqa	%xmm0, -106(%edx)
L(aligned_16_90bytes):
	movdqa	%xmm0, -90(%edx)
L(aligned_16_74bytes):
	movdqa	%xmm0, -74(%edx)
L(aligned_16_58bytes):
	movdqa	%xmm0, -58(%edx)
L(aligned_16_42bytes):
	movdqa	%xmm0, -42(%edx)
L(aligned_16_26bytes):
	movdqa	%xmm0, -26(%edx)
L(aligned_16_10bytes):
	movq	%xmm0, -10(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 11: leftover = 8 + 2 + 1 bytes.  */
L(aligned_16_123bytes):
	movdqa	%xmm0, -123(%edx)
L(aligned_16_107bytes):
	movdqa	%xmm0, -107(%edx)
L(aligned_16_91bytes):
	movdqa	%xmm0, -91(%edx)
L(aligned_16_75bytes):
	movdqa	%xmm0, -75(%edx)
L(aligned_16_59bytes):
	movdqa	%xmm0, -59(%edx)
L(aligned_16_43bytes):
	movdqa	%xmm0, -43(%edx)
L(aligned_16_27bytes):
	movdqa	%xmm0, -27(%edx)
L(aligned_16_11bytes):
	movq	%xmm0, -11(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 12: leftover = 8 + 4 bytes.  */
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%edx)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%edx)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%edx)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%edx)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%edx)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%edx)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%edx)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%edx)
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 13: leftover = 8 + 4 + 1 bytes.  */
L(aligned_16_125bytes):
	movdqa	%xmm0, -125(%edx)
L(aligned_16_109bytes):
	movdqa	%xmm0, -109(%edx)
L(aligned_16_93bytes):
	movdqa	%xmm0, -93(%edx)
L(aligned_16_77bytes):
	movdqa	%xmm0, -77(%edx)
L(aligned_16_61bytes):
	movdqa	%xmm0, -61(%edx)
L(aligned_16_45bytes):
	movdqa	%xmm0, -45(%edx)
L(aligned_16_29bytes):
	movdqa	%xmm0, -29(%edx)
L(aligned_16_13bytes):
	movq	%xmm0, -13(%edx)
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 14: leftover = 8 + 4 + 2 bytes.  */
L(aligned_16_126bytes):
	movdqa	%xmm0, -126(%edx)
L(aligned_16_110bytes):
	movdqa	%xmm0, -110(%edx)
L(aligned_16_94bytes):
	movdqa	%xmm0, -94(%edx)
L(aligned_16_78bytes):
	movdqa	%xmm0, -78(%edx)
L(aligned_16_62bytes):
	movdqa	%xmm0, -62(%edx)
L(aligned_16_46bytes):
	movdqa	%xmm0, -46(%edx)
L(aligned_16_30bytes):
	movdqa	%xmm0, -30(%edx)
L(aligned_16_14bytes):
	movq	%xmm0, -14(%edx)
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
/* N mod 16 == 15: leftover = 8 + 4 + 2 + 1 bytes.  */
L(aligned_16_127bytes):
	movdqa	%xmm0, -127(%edx)
L(aligned_16_111bytes):
	movdqa	%xmm0, -111(%edx)
L(aligned_16_95bytes):
	movdqa	%xmm0, -95(%edx)
L(aligned_16_79bytes):
	movdqa	%xmm0, -79(%edx)
L(aligned_16_63bytes):
	movdqa	%xmm0, -63(%edx)
L(aligned_16_47bytes):
	movdqa	%xmm0, -47(%edx)
L(aligned_16_31bytes):
	movdqa	%xmm0, -31(%edx)
L(aligned_16_15bytes):
	movq	%xmm0, -15(%edx)
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	/* Last return in the function: RETURN_END, not RETURN, because no code
	   follows whose unwind state would have to be re-declared.  */
	RETURN_END

END(memset_atom)
835