; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512

; Test codegen for under-aligned nontemporal vector loads.
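;
; MOVNTDQA / VMOVNTDQA require the memory operand to be aligned to the full
; vector width, so when the load's alignment is smaller than that the lowering
; should either fall back to regular unaligned loads or split the load into
; smaller naturally aligned nontemporal loads and reassemble the result.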

; XMM versions.

define <2 x double> @test_v2f64_align1(<2 x double>* %src) nounwind {
; SSE-LABEL: test_v2f64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <2 x double>, <2 x double>* %src, align 1, !nontemporal !1
  ret <2 x double> %1
}

define <4 x float> @test_v4f32_align1(<4 x float>* %src) nounwind {
; SSE-LABEL: test_v4f32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %src, align 1, !nontemporal !1
  ret <4 x float> %1
}

define <2 x i64> @test_v2i64_align1(<2 x i64>* %src) nounwind {
; SSE-LABEL: test_v2i64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64>* %src, align 1, !nontemporal !1
  ret <2 x i64> %1
}

define <4 x i32> @test_v4i32_align1(<4 x i32>* %src) nounwind {
; SSE-LABEL: test_v4i32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %src, align 1, !nontemporal !1
  ret <4 x i32> %1
}

define <8 x i16> @test_v8i16_align1(<8 x i16>* %src) nounwind {
; SSE-LABEL: test_v8i16_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %src, align 1, !nontemporal !1
  ret <8 x i16> %1
}

define <16 x i8> @test_v16i8_align1(<16 x i8>* %src) nounwind {
; SSE-LABEL: test_v16i8_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i8_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %src, align 1, !nontemporal !1
  ret <16 x i8> %1
}

; YMM versions.

define <4 x double> @test_v4f64_align1(<4 x double>* %src) nounwind {
; SSE-LABEL: test_v4f64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <4 x double>, <4 x double>* %src, align 1, !nontemporal !1
  ret <4 x double> %1
}

define <8 x float> @test_v8f32_align1(<8 x float>* %src) nounwind {
; SSE-LABEL: test_v8f32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <8 x float>, <8 x float>* %src, align 1, !nontemporal !1
  ret <8 x float> %1
}

define <4 x i64> @test_v4i64_align1(<4 x i64>* %src) nounwind {
; SSE-LABEL: test_v4i64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64>* %src, align 1, !nontemporal !1
  ret <4 x i64> %1
}

define <8 x i32> @test_v8i32_align1(<8 x i32>* %src) nounwind {
; SSE-LABEL: test_v8i32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32>* %src, align 1, !nontemporal !1
  ret <8 x i32> %1
}

define <16 x i16> @test_v16i16_align1(<16 x i16>* %src) nounwind {
; SSE-LABEL: test_v16i16_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i16_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %src, align 1, !nontemporal !1
  ret <16 x i16> %1
}

define <32 x i8> @test_v32i8_align1(<32 x i8>* %src) nounwind {
; SSE-LABEL: test_v32i8_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v32i8_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %src, align 1, !nontemporal !1
  ret <32 x i8> %1
}

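; With only 16-byte alignment a full-width YMM nontemporal load is not legal,
; so SSE4.1 uses MOVNTDQA on each 16-byte half while the AVX targets perform
; two XMM nontemporal loads and reassemble the value through an aligned stack
; slot.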
define <4 x double> @test_v4f64_align16(<4 x double>* %src) nounwind {
; SSE2-LABEL: test_v4f64_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f64_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <4 x double>, <4 x double>* %src, align 16, !nontemporal !1
  ret <4 x double> %1
}

define <8 x float> @test_v8f32_align16(<8 x float>* %src) nounwind {
; SSE2-LABEL: test_v8f32_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <8 x float>, <8 x float>* %src, align 16, !nontemporal !1
  ret <8 x float> %1
}

define <4 x i64> @test_v4i64_align16(<4 x i64>* %src) nounwind {
; SSE2-LABEL: test_v4i64_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i64_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4i64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64>* %src, align 16, !nontemporal !1
  ret <4 x i64> %1
}

define <8 x i32> @test_v8i32_align16(<8 x i32>* %src) nounwind {
; SSE2-LABEL: test_v8i32_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i32_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8i32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32>* %src, align 16, !nontemporal !1
  ret <8 x i32> %1
}

define <16 x i16> @test_v16i16_align16(<16 x i16>* %src) nounwind {
; SSE2-LABEL: test_v16i16_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i16_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16i16_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %src, align 16, !nontemporal !1
  ret <16 x i16> %1
}

define <32 x i8> @test_v32i8_align16(<32 x i8>* %src) nounwind {
; SSE2-LABEL: test_v32i8_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i8_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v32i8_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %src, align 16, !nontemporal !1
  ret <32 x i8> %1
}

; ZMM versions.

define <8 x double> @test_v8f64_align1(<8 x double>* %src) nounwind {
; SSE-LABEL: test_v8f64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8f64_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <8 x double>, <8 x double>* %src, align 1, !nontemporal !1
  ret <8 x double> %1
}

define <16 x float> @test_v16f32_align1(<16 x float>* %src) nounwind {
; SSE-LABEL: test_v16f32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16f32_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <16 x float>, <16 x float>* %src, align 1, !nontemporal !1
  ret <16 x float> %1
}

define <8 x i64> @test_v8i64_align1(<8 x i64>* %src) nounwind {
; SSE-LABEL: test_v8i64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8i64_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <8 x i64>, <8 x i64>* %src, align 1, !nontemporal !1
  ret <8 x i64> %1
}

define <16 x i32> @test_v16i32_align1(<16 x i32>* %src) nounwind {
; SSE-LABEL: test_v16i32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i32_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <16 x i32>, <16 x i32>* %src, align 1, !nontemporal !1
  ret <16 x i32> %1
}

define <32 x i16> @test_v32i16_align1(<32 x i16>* %src) nounwind {
; SSE-LABEL: test_v32i16_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %src, align 1, !nontemporal !1
  ret <32 x i16> %1
}

define <64 x i8> @test_v64i8_align1(<64 x i8>* %src) nounwind {
; SSE-LABEL: test_v64i8_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v64i8_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i8_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %src, align 1, !nontemporal !1
  ret <64 x i8> %1
}

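; The same splitting applies to 512-bit loads with 16-byte alignment: SSE4.1
; issues four XMM nontemporal loads, and the AVX/AVX512 targets reassemble the
; result through an aligned stack slot.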
define <8 x double> @test_v8f64_align16(<8 x double>* %src) nounwind {
; SSE2-LABEL: test_v8f64_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f64_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8f64_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <8 x double>, <8 x double>* %src, align 16, !nontemporal !1
  ret <8 x double> %1
}

define <16 x float> @test_v16f32_align16(<16 x float>* %src) nounwind {
; SSE2-LABEL: test_v16f32_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16f32_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <16 x float>, <16 x float>* %src, align 16, !nontemporal !1
  ret <16 x float> %1
}

define <8 x i64> @test_v8i64_align16(<8 x i64>* %src) nounwind {
; SSE2-LABEL: test_v8i64_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i64_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i64_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <8 x i64>, <8 x i64>* %src, align 16, !nontemporal !1
  ret <8 x i64> %1
}

define <16 x i32> @test_v16i32_align16(<16 x i32>* %src) nounwind {
; SSE2-LABEL: test_v16i32_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i32_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i32_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <16 x i32>, <16 x i32>* %src, align 16, !nontemporal !1
  ret <16 x i32> %1
}

define <32 x i16> @test_v32i16_align16(<32 x i16>* %src) nounwind {
; SSE2-LABEL: test_v32i16_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i16_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %src, align 16, !nontemporal !1
  ret <32 x i16> %1
}

define <64 x i8> @test_v64i8_align16(<64 x i8>* %src) nounwind {
; SSE2-LABEL: test_v64i8_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v64i8_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v64i8_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i8_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %src, align 16, !nontemporal !1
  ret <64 x i8> %1
}

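; With 32-byte alignment AVX2 can use the 256-bit VMOVNTDQA directly and AVX1
; builds each YMM from two XMM nontemporal loads, but the 512-bit nontemporal
; load needs 64-byte alignment, so AVX512 still goes through the stack.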
define <8 x double> @test_v8f64_align32(<8 x double>* %src) nounwind {
; SSE2-LABEL: test_v8f64_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f64_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8f64_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <8 x double>, <8 x double>* %src, align 32, !nontemporal !1
  ret <8 x double> %1
}

define <16 x float> @test_v16f32_align32(<16 x float>* %src) nounwind {
; SSE2-LABEL: test_v16f32_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16f32_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <16 x float>, <16 x float>* %src, align 32, !nontemporal !1
  ret <16 x float> %1
}

define <8 x i64> @test_v8i64_align32(<8 x i64>* %src) nounwind {
; SSE2-LABEL: test_v8i64_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i64_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i64_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <8 x i64>, <8 x i64>* %src, align 32, !nontemporal !1
  ret <8 x i64> %1
}

define <16 x i32> @test_v16i32_align32(<16 x i32>* %src) nounwind {
; SSE2-LABEL: test_v16i32_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i32_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i32_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <16 x i32>, <16 x i32>* %src, align 32, !nontemporal !1
  ret <16 x i32> %1
}

define <32 x i16> @test_v32i16_align32(<32 x i16>* %src) nounwind {
; SSE2-LABEL: test_v32i16_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i16_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %src, align 32, !nontemporal !1
  ret <32 x i16> %1
}

define <64 x i8> @test_v64i8_align32(<64 x i8>* %src) nounwind {
; SSE2-LABEL: test_v64i8_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v64i8_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v64i8_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i8_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %src, align 32, !nontemporal !1
  ret <64 x i8> %1
}

!1 = !{i32 1}