; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=SSE,SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

; Test codegen for under-aligned nontemporal vector stores.
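;
; Vector nontemporal stores (movntps/movntpd/movntdq and their VEX/EVEX
; forms) require the memory operand to be aligned to the vector width, so an
; under-aligned nontemporal store cannot use them directly. Instead codegen
; scalarizes the store into 8-byte nontemporal stores (movnti from a zeroed
; GPR, or movntsd from a zeroed XMM register on SSE4A targets), or splits it
; into movntps chunks of the largest legal width the alignment allows, as
; the tests below demonstrate.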

; XMM versions.

define void @test_zero_v2f64_align1(<2 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v2f64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v2f64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v2f64_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    retq
  store <2 x double> zeroinitializer, <2 x double>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v4f32_align1(<4 x float>* %dst) nounwind {
; SSE-LABEL: test_zero_v4f32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4f32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4f32_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    retq
  store <4 x float> zeroinitializer, <4 x float>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v2i64_align1(<2 x i64>* %dst) nounwind {
; SSE-LABEL: test_zero_v2i64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v2i64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v2i64_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    retq
  store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v4i32_align1(<4 x i32>* %dst) nounwind {
; SSE-LABEL: test_zero_v4i32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4i32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4i32_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    retq
  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8i16_align1(<8 x i16>* %dst) nounwind {
; SSE-LABEL: test_zero_v8i16_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i16_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i16_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    retq
  store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v16i8_align1(<16 x i8>* %dst) nounwind {
; SSE-LABEL: test_zero_v16i8_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i8_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i8_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    retq
  store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 1, !nontemporal !1
  ret void
}

; YMM versions.
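;
; A 32-byte store with only align 1 is scalarized into four 8-byte
; nontemporal stores; on SSE4A targets some or all of the chunks use
; movntsd instead of movnti.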

define void @test_zero_v4f64_align1(<4 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v4f64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    movntiq %rax, 24(%rdi)
; SSE-NEXT:    movntiq %rax, 16(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4f64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4f64_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    retq
  store <4 x double> zeroinitializer, <4 x double>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8f32_align1(<8 x float>* %dst) nounwind {
; SSE2-LABEL: test_zero_v8f32_align1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v8f32_align1:
; SSE4A:       # %bb.0:
; SSE4A-NEXT:    xorl %eax, %eax
; SSE4A-NEXT:    movntiq %rax, 8(%rdi)
; SSE4A-NEXT:    movntiq %rax, 24(%rdi)
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v8f32_align1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f32_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    retq
  store <8 x float> zeroinitializer, <8 x float>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v4i64_align1(<4 x i64>* %dst) nounwind {
; SSE2-LABEL: test_zero_v4i64_align1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v4i64_align1:
; SSE4A:       # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v4i64_align1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v4i64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4i64_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    retq
  store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8i32_align1(<8 x i32>* %dst) nounwind {
; SSE2-LABEL: test_zero_v8i32_align1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v8i32_align1:
; SSE4A:       # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v8i32_align1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i32_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    retq
  store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v16i16_align1(<16 x i16>* %dst) nounwind {
; SSE2-LABEL: test_zero_v16i16_align1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v16i16_align1:
; SSE4A:       # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v16i16_align1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i16_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i16_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    retq
  store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v32i8_align1(<32 x i8>* %dst) nounwind {
; SSE2-LABEL: test_zero_v32i8_align1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v32i8_align1:
; SSE4A:       # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v32i8_align1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i8_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i8_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    retq
  store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 1, !nontemporal !1
  ret void
}

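; With 16-byte alignment, the 32-byte stores split into two aligned 16-byte
; movntps stores on all targets.
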
define void @test_zero_v4f64_align16(<4 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v4f64_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4f64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4f64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <4 x double> zeroinitializer, <4 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8f32_align16(<8 x float>* %dst) nounwind {
; SSE-LABEL: test_zero_v8f32_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <8 x float> zeroinitializer, <8 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v4i64_align16(<4 x i64>* %dst) nounwind {
; SSE-LABEL: test_zero_v4i64_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4i64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4i64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8i32_align16(<8 x i32>* %dst) nounwind {
; SSE-LABEL: test_zero_v8i32_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16i16_align16(<16 x i16>* %dst) nounwind {
; SSE-LABEL: test_zero_v16i16_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i16_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i16_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v32i8_align16(<32 x i8>* %dst) nounwind {
; SSE-LABEL: test_zero_v32i8_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i8_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i8_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; ZMM versions.
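;
; A 64-byte store with only align 1 is scalarized into eight 8-byte
; nontemporal stores (movnti, or movntsd on SSE4A).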

define void @test_zero_v8f64_align1(<8 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v8f64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 24(%rdi)
; SSE-NEXT:    movntiq %rax, 16(%rdi)
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    movntiq %rax, 56(%rdi)
; SSE-NEXT:    movntiq %rax, 48(%rdi)
; SSE-NEXT:    movntiq %rax, 40(%rdi)
; SSE-NEXT:    movntiq %rax, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 56(%rdi)
; AVX-NEXT:    movntiq %rax, 48(%rdi)
; AVX-NEXT:    movntiq %rax, 40(%rdi)
; AVX-NEXT:    movntiq %rax, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f64_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 56(%rdi)
; AVX512-NEXT:    movntiq %rax, 48(%rdi)
; AVX512-NEXT:    movntiq %rax, 40(%rdi)
; AVX512-NEXT:    movntiq %rax, 32(%rdi)
; AVX512-NEXT:    retq
  store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v16f32_align1(<16 x float>* %dst) nounwind {
; SSE2-LABEL: test_zero_v16f32_align1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 56(%rdi)
; SSE2-NEXT:    movntiq %rax, 48(%rdi)
; SSE2-NEXT:    movntiq %rax, 40(%rdi)
; SSE2-NEXT:    movntiq %rax, 32(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v16f32_align1:
; SSE4A:       # %bb.0:
; SSE4A-NEXT:    xorl %eax, %eax
; SSE4A-NEXT:    movntiq %rax, 24(%rdi)
; SSE4A-NEXT:    movntiq %rax, 8(%rdi)
; SSE4A-NEXT:    movntiq %rax, 56(%rdi)
; SSE4A-NEXT:    movntiq %rax, 40(%rdi)
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v16f32_align1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 56(%rdi)
; SSE41-NEXT:    movntiq %rax, 48(%rdi)
; SSE41-NEXT:    movntiq %rax, 40(%rdi)
; SSE41-NEXT:    movntiq %rax, 32(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v16f32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 56(%rdi)
; AVX-NEXT:    movntiq %rax, 48(%rdi)
; AVX-NEXT:    movntiq %rax, 40(%rdi)
; AVX-NEXT:    movntiq %rax, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16f32_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 56(%rdi)
; AVX512-NEXT:    movntiq %rax, 48(%rdi)
; AVX512-NEXT:    movntiq %rax, 40(%rdi)
; AVX512-NEXT:    movntiq %rax, 32(%rdi)
; AVX512-NEXT:    retq
  store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8i64_align1(<8 x i64>* %dst) nounwind {
; SSE2-LABEL: test_zero_v8i64_align1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 56(%rdi)
; SSE2-NEXT:    movntiq %rax, 48(%rdi)
; SSE2-NEXT:    movntiq %rax, 40(%rdi)
; SSE2-NEXT:    movntiq %rax, 32(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v8i64_align1:
; SSE4A:       # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v8i64_align1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 56(%rdi)
; SSE41-NEXT:    movntiq %rax, 48(%rdi)
; SSE41-NEXT:    movntiq %rax, 40(%rdi)
; SSE41-NEXT:    movntiq %rax, 32(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 56(%rdi)
; AVX-NEXT:    movntiq %rax, 48(%rdi)
; AVX-NEXT:    movntiq %rax, 40(%rdi)
; AVX-NEXT:    movntiq %rax, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i64_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 56(%rdi)
; AVX512-NEXT:    movntiq %rax, 48(%rdi)
; AVX512-NEXT:    movntiq %rax, 40(%rdi)
; AVX512-NEXT:    movntiq %rax, 32(%rdi)
; AVX512-NEXT:    retq
  store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v16i32_align1(<16 x i32>* %dst) nounwind {
; SSE2-LABEL: test_zero_v16i32_align1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 56(%rdi)
; SSE2-NEXT:    movntiq %rax, 48(%rdi)
; SSE2-NEXT:    movntiq %rax, 40(%rdi)
; SSE2-NEXT:    movntiq %rax, 32(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v16i32_align1:
; SSE4A:       # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v16i32_align1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 56(%rdi)
; SSE41-NEXT:    movntiq %rax, 48(%rdi)
; SSE41-NEXT:    movntiq %rax, 40(%rdi)
; SSE41-NEXT:    movntiq %rax, 32(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 56(%rdi)
; AVX-NEXT:    movntiq %rax, 48(%rdi)
; AVX-NEXT:    movntiq %rax, 40(%rdi)
; AVX-NEXT:    movntiq %rax, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i32_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 56(%rdi)
; AVX512-NEXT:    movntiq %rax, 48(%rdi)
; AVX512-NEXT:    movntiq %rax, 40(%rdi)
; AVX512-NEXT:    movntiq %rax, 32(%rdi)
; AVX512-NEXT:    retq
  store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v32i16_align1(<32 x i16>* %dst) nounwind {
; SSE2-LABEL: test_zero_v32i16_align1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 56(%rdi)
; SSE2-NEXT:    movntiq %rax, 48(%rdi)
; SSE2-NEXT:    movntiq %rax, 40(%rdi)
; SSE2-NEXT:    movntiq %rax, 32(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v32i16_align1:
; SSE4A:       # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v32i16_align1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 56(%rdi)
; SSE41-NEXT:    movntiq %rax, 48(%rdi)
; SSE41-NEXT:    movntiq %rax, 40(%rdi)
; SSE41-NEXT:    movntiq %rax, 32(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i16_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 56(%rdi)
; AVX-NEXT:    movntiq %rax, 48(%rdi)
; AVX-NEXT:    movntiq %rax, 40(%rdi)
; AVX-NEXT:    movntiq %rax, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i16_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 56(%rdi)
; AVX512-NEXT:    movntiq %rax, 48(%rdi)
; AVX512-NEXT:    movntiq %rax, 40(%rdi)
; AVX512-NEXT:    movntiq %rax, 32(%rdi)
; AVX512-NEXT:    retq
  store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v64i8_align1(<64 x i8>* %dst) nounwind {
; SSE2-LABEL: test_zero_v64i8_align1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 56(%rdi)
; SSE2-NEXT:    movntiq %rax, 48(%rdi)
; SSE2-NEXT:    movntiq %rax, 40(%rdi)
; SSE2-NEXT:    movntiq %rax, 32(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v64i8_align1:
; SSE4A:       # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v64i8_align1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 56(%rdi)
; SSE41-NEXT:    movntiq %rax, 48(%rdi)
; SSE41-NEXT:    movntiq %rax, 40(%rdi)
; SSE41-NEXT:    movntiq %rax, 32(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v64i8_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 56(%rdi)
; AVX-NEXT:    movntiq %rax, 48(%rdi)
; AVX-NEXT:    movntiq %rax, 40(%rdi)
; AVX-NEXT:    movntiq %rax, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v64i8_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 56(%rdi)
; AVX512-NEXT:    movntiq %rax, 48(%rdi)
; AVX512-NEXT:    movntiq %rax, 40(%rdi)
; AVX512-NEXT:    movntiq %rax, 32(%rdi)
; AVX512-NEXT:    retq
  store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1
  ret void
}

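; With 16-byte alignment, the 64-byte stores split into four aligned 16-byte
; movntps stores.
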
define void @test_zero_v8f64_align16(<8 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v8f64_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <8 x double> zeroinitializer, <8 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16f32_align16(<16 x float>* %dst) nounwind {
; SSE-LABEL: test_zero_v16f32_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16f32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16f32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <16 x float> zeroinitializer, <16 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8i64_align16(<8 x i64>* %dst) nounwind {
; SSE-LABEL: test_zero_v8i64_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16i32_align16(<16 x i32>* %dst) nounwind {
; SSE-LABEL: test_zero_v16i32_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v32i16_align16(<32 x i16>* %dst) nounwind {
; SSE-LABEL: test_zero_v32i16_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i16_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i16_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v64i8_align16(<64 x i8>* %dst) nounwind {
; SSE-LABEL: test_zero_v64i8_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v64i8_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v64i8_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

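; With 32-byte alignment, AVX and AVX512 targets emit two aligned 32-byte
; vmovntps YMM stores (followed by vzeroupper), while SSE targets still
; split into four 16-byte movntps stores.
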
define void @test_zero_v8f64_align32(<8 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v8f64_align32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f64_align32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f64_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <8 x double> zeroinitializer, <8 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v16f32_align32(<16 x float>* %dst) nounwind {
; SSE-LABEL: test_zero_v16f32_align32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16f32_align32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16f32_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <16 x float> zeroinitializer, <16 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v8i64_align32(<8 x i64>* %dst) nounwind {
; SSE-LABEL: test_zero_v8i64_align32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i64_align32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i64_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v16i32_align32(<16 x i32>* %dst) nounwind {
; SSE-LABEL: test_zero_v16i32_align32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i32_align32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i32_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v32i16_align32(<32 x i16>* %dst) nounwind {
; SSE-LABEL: test_zero_v32i16_align32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i16_align32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i16_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v64i8_align32(<64 x i8>* %dst) nounwind {
; SSE-LABEL: test_zero_v64i8_align32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v64i8_align32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v64i8_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 32, !nontemporal !1
  ret void
}

!1 = !{i32 1}
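
; !nontemporal metadata must be a node with a single i32 operand of value 1;
; it marks the stores above as nontemporal so the backend can select
; streaming-store instructions.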