; NOTE: Assertions have been autogenerated by update_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

; https://llvm.org/bugs/show_bug.cgi?id=27100

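; The RUN lines cover the interesting lowering strategies for a memset of a
; nonzero constant: 8-byte scalar stores when unaligned 16-byte ops are slow
; (SSE), unaligned 16-byte vector stores (SSE2FAST), and 32-byte ymm stores
; (AVX/AVX2). The magic constant $3038287259199220266 below is just
; 0x2A2A2A2A2A2A2A2A, i.e. the byte 42 (0x2A) repeated in all eight lanes.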
define void @memset_16_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_16_nonzero_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_16_nonzero_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_16_nonzero_bytes:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %xmm0, (%rdi)
; AVX-NEXT:    retq
;
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
  ret void
}

define void @memset_32_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_32_nonzero_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_32_nonzero_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_32_nonzero_bytes:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1)
  ret void
}

define void @memset_64_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_64_nonzero_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 56(%rdi)
; SSE-NEXT:    movq %rax, 48(%rdi)
; SSE-NEXT:    movq %rax, 40(%rdi)
; SSE-NEXT:    movq %rax, 32(%rdi)
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_64_nonzero_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_64_nonzero_bytes:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX-NEXT:    vmovups %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
  ret void
}

define void @memset_128_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_128_nonzero_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 120(%rdi)
; SSE-NEXT:    movq %rax, 112(%rdi)
; SSE-NEXT:    movq %rax, 104(%rdi)
; SSE-NEXT:    movq %rax, 96(%rdi)
; SSE-NEXT:    movq %rax, 88(%rdi)
; SSE-NEXT:    movq %rax, 80(%rdi)
; SSE-NEXT:    movq %rax, 72(%rdi)
; SSE-NEXT:    movq %rax, 64(%rdi)
; SSE-NEXT:    movq %rax, 56(%rdi)
; SSE-NEXT:    movq %rax, 48(%rdi)
; SSE-NEXT:    movq %rax, 40(%rdi)
; SSE-NEXT:    movq %rax, 32(%rdi)
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_128_nonzero_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_128_nonzero_bytes:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX-NEXT:    vmovups %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
  ret void
}

define void @memset_256_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_256_nonzero_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    pushq %rax
; SSE-NEXT:  .Ltmp0:
; SSE-NEXT:    .cfi_def_cfa_offset 16
; SSE-NEXT:    movl $42, %esi
; SSE-NEXT:    movl $256, %edx # imm = 0x100
; SSE-NEXT:    callq memset
; SSE-NEXT:    popq %rax
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_256_nonzero_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 240(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 224(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 208(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 192(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 176(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 160(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 144(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 128(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_256_nonzero_bytes:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX-NEXT:    vmovups %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
  ret void
}

declare i8* @__memset_chk(i8*, i32, i64, i64)
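; __memset_chk(dst, c, len, dstlen) is the fortified memset and returns dst.
; A dstlen of -1 (SIZE_MAX) says the destination object size is unknown, so
; the checked calls above can be folded to plain memsets and expanded inline.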

; Repeat with a non-constant value for the stores.
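; With a non-constant byte, the SSE lowering splats it by multiplication:
; zero-extend %c and multiply by 0x0101010101010101, which copies the byte
; into every byte lane of the 64-bit product (e.g. 0x2A * 0x0101010101010101
; = 0x2A2A2A2A2A2A2A2A). The vector paths splat with unpack/shuffle sequences
; (SSE2), vpshufb against a zero mask (AVX1), or vpbroadcastb (AVX2).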
define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_16_nonconst_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_16_nonconst_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_16_nonconst_bytes:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_16_nonconst_bytes:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i32 1, i1 false)
  ret void
}

define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_32_nonconst_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_32_nonconst_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_32_nonconst_bytes:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_32_nonconst_bytes:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 32, i32 1, i1 false)
  ret void
}

define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_64_nonconst_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 56(%rdi)
; SSE-NEXT:    movq %rcx, 48(%rdi)
; SSE-NEXT:    movq %rcx, 40(%rdi)
; SSE-NEXT:    movq %rcx, 32(%rdi)
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_64_nonconst_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_64_nonconst_bytes:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_64_nonconst_bytes:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i32 1, i1 false)
  ret void
}

define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_128_nonconst_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 120(%rdi)
; SSE-NEXT:    movq %rcx, 112(%rdi)
; SSE-NEXT:    movq %rcx, 104(%rdi)
; SSE-NEXT:    movq %rcx, 96(%rdi)
; SSE-NEXT:    movq %rcx, 88(%rdi)
; SSE-NEXT:    movq %rcx, 80(%rdi)
; SSE-NEXT:    movq %rcx, 72(%rdi)
; SSE-NEXT:    movq %rcx, 64(%rdi)
; SSE-NEXT:    movq %rcx, 56(%rdi)
; SSE-NEXT:    movq %rcx, 48(%rdi)
; SSE-NEXT:    movq %rcx, 40(%rdi)
; SSE-NEXT:    movq %rcx, 32(%rdi)
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_128_nonconst_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_128_nonconst_bytes:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_128_nonconst_bytes:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i32 1, i1 false)
  ret void
}

define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_256_nonconst_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movl $256, %edx # imm = 0x100
; SSE-NEXT:    jmp memset # TAILCALL
;
; SSE2FAST-LABEL: memset_256_nonconst_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2FAST-NEXT:    movdqu %xmm0, 240(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 224(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 208(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 192(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 176(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 160(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 144(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 128(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_256_nonconst_bytes:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_256_nonconst_bytes:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 224(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 192(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 160(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 128(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i32 1, i1 false)
  ret void
}

declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
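; This is the older form of the memset intrinsic: the i32 operand is the
; alignment (1 = byte-aligned here) and the i1 operand is the volatile flag.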