; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X86,X86-SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64-SSE,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X64-SSE,X64-SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X64-SSE,X64-SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2

;
; PR42123
;

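; Two consecutive 32-byte aligned nontemporal v4f32 loads/stores. With AVX2 they
; can be merged into a single ymm VMOVNTDQA/VMOVNTDQ pair; SSE4.1/AVX1 keep two
; xmm NT ops, and SSE2/SSE4A (which lack an NT vector load) use regular aligned
; loads with MOVNTPS stores.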
define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movntps %xmm0, (%eax)
; X86-NEXT:    movntps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align32:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movaps (%rdi), %xmm0
; X64-SSE2-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movntps %xmm0, (%rsi)
; X64-SSE2-NEXT:    movntps %xmm1, 16(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align32:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movaps (%rdi), %xmm0
; X64-SSE4A-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movntps %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movntps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align32:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; X64-SSE41-NEXT:    movntdq %xmm0, (%rsi)
; X64-SSE41-NEXT:    movntdq %xmm1, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX1-LABEL: merge_2_v4f32_align32:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; X64-AVX1-NEXT:    vmovntdqa (%rdi), %xmm1
; X64-AVX1-NEXT:    vmovntdq %xmm1, (%rsi)
; X64-AVX1-NEXT:    vmovntdq %xmm0, 16(%rsi)
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: merge_2_v4f32_align32:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; X64-AVX2-NEXT:    vmovntdq %ymm0, (%rsi)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 32, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 16, !nontemporal !0
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 32, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 16, !nontemporal !0
  ret void
}

; Don't merge nt and non-nt loads even if aligned.
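; The first load keeps its NT hint (MOVNTDQA on SSE4.1/AVX) while the second,
; non-NT load stays a regular (V)MOVAPS, so the pair is not combined into one
; wider load.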
define void @merge_2_v4f32_align32_mix_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32_mix_ntload:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movaps %xmm0, (%eax)
; X86-NEXT:    movaps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movaps (%rdi), %xmm0
; X64-SSE2-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movaps %xmm0, (%rsi)
; X64-SSE2-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movaps (%rdi), %xmm0
; X64-SSE4A-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movaps %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE41-NEXT:    movdqa %xmm0, (%rsi)
; X64-SSE41-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX-NEXT:    vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 32, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 16
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 32
  store <4 x float> %4, <4 x float>* %6, align 16
  ret void
}

; Don't merge nt and non-nt stores even if aligned.
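; Only the first store carries the NT hint and is emitted as (V)MOVNTPS; the
; second store stays a regular (V)MOVAPS, so the stores are not merged.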
define void @merge_2_v4f32_align32_mix_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movntps %xmm0, (%eax)
; X86-NEXT:    movaps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE-NEXT:    movntps %xmm0, (%rsi)
; X64-SSE-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT:    vmovntps %xmm0, (%rsi)
; X64-AVX-NEXT:    vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 32
  %4 = load <4 x float>, <4 x float>* %2, align 16
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 32, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 16
  ret void
}

; AVX2 can't perform NT-load-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTDQA xmm.
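; (A 32-byte VMOVNTDQA ymm load requires a 32-byte aligned address, so two
; 16-byte VMOVNTDQA xmm loads are used instead.)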
define void @merge_2_v4f32_align16_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntload:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movaps %xmm0, (%eax)
; X86-NEXT:    movaps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movaps (%rdi), %xmm0
; X64-SSE2-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movaps %xmm0, (%rsi)
; X64-SSE2-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movaps (%rdi), %xmm0
; X64-SSE4A-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movaps %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; X64-SSE41-NEXT:    movdqa %xmm0, (%rsi)
; X64-SSE41-NEXT:    movdqa %xmm1, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntload:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT:    vmovntdqa 16(%rdi), %xmm1
; X64-AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX-NEXT:    vmovdqa %xmm1, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 16, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 16, !nontemporal !0
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 16
  store <4 x float> %4, <4 x float>* %6, align 16
  ret void
}

; AVX can't perform NT-store-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTPS xmm.
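; (A 32-byte VMOVNTPS ymm store likewise requires a 32-byte aligned address, so
; the stores remain two VMOVNTPS xmm.)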
define void @merge_2_v4f32_align16_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntstore:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movntps %xmm0, (%eax)
; X86-NEXT:    movntps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE-LABEL: merge_2_v4f32_align16_ntstore:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE-NEXT:    movntps %xmm0, (%rsi)
; X64-SSE-NEXT:    movntps %xmm1, 16(%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntstore:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT:    vmovntps %xmm0, (%rsi)
; X64-AVX-NEXT:    vmovntps %xmm1, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 16
  %4 = load <4 x float>, <4 x float>* %2, align 16
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 16, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 16, !nontemporal !0
  ret void
}

; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads.
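; (Since the NT hint cannot be honored for unaligned vector loads, it is dropped
; and AVX merges the accesses into a single unaligned ymm VMOVUPS load/store.)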
define void @merge_2_v4f32_align1_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align1_ntload:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movups (%ecx), %xmm0
; X86-NEXT:    movups 16(%ecx), %xmm1
; X86-NEXT:    movups %xmm0, (%eax)
; X86-NEXT:    movups %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE-LABEL: merge_2_v4f32_align1_ntload:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups (%rdi), %xmm0
; X64-SSE-NEXT:    movups 16(%rdi), %xmm1
; X64-SSE-NEXT:    movups %xmm0, (%rsi)
; X64-SSE-NEXT:    movups %xmm1, 16(%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntload:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
; X64-AVX-NEXT:    vmovups %ymm0, (%rsi)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 1, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 1, !nontemporal !0
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 1
  store <4 x float> %4, <4 x float>* %6, align 1
  ret void
}

; Nothing can perform NT-store-vector on 1-byte aligned memory.
; Must be scalarized to use MOVNTI/MOVNTSD.
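; (MOVNTI stores a 32/64-bit GPR and MOVNTSD (SSE4A) stores a scalar double
; with a non-temporal hint; neither requires alignment, so they can service
; 1-byte aligned destinations.)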
define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, (%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X86-SSE2-NEXT:    movd %xmm2, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm2, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
; X86-SSE2-NEXT:    movd %xmm1, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
; X86-SSE2-NEXT:    retl
;
; X86-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X86-SSE4A:       # %bb.0:
; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT:    movups (%ecx), %xmm0
; X86-SSE4A-NEXT:    movups 16(%ecx), %xmm1
; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; X86-SSE4A-NEXT:    movntsd %xmm2, 8(%eax)
; X86-SSE4A-NEXT:    movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT:    movntsd %xmm3, 24(%eax)
; X86-SSE4A-NEXT:    movntsd %xmm1, 16(%eax)
; X86-SSE4A-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, (%rsi)
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT:    movq %xmm1, %rax
; X64-SSE2-NEXT:    movntiq %rax, 16(%rsi)
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movups (%rdi), %xmm0
; X64-SSE4A-NEXT:    movups 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; X64-SSE4A-NEXT:    movntsd %xmm2, 8(%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm3, 24(%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
; X64-SSE41-NEXT:    pextrq $1, %xmm0, %rax
; X64-SSE41-NEXT:    movntiq %rax, 8(%rsi)
; X64-SSE41-NEXT:    movq %xmm0, %rax
; X64-SSE41-NEXT:    movntiq %rax, (%rsi)
; X64-SSE41-NEXT:    pextrq $1, %xmm1, %rax
; X64-SSE41-NEXT:    movntiq %rax, 24(%rsi)
; X64-SSE41-NEXT:    movq %xmm1, %rax
; X64-SSE41-NEXT:    movntiq %rax, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
; X64-AVX-NEXT:    vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT:    movntiq %rax, 8(%rsi)
; X64-AVX-NEXT:    vmovq %xmm0, %rax
; X64-AVX-NEXT:    movntiq %rax, (%rsi)
; X64-AVX-NEXT:    vpextrq $1, %xmm1, %rax
; X64-AVX-NEXT:    movntiq %rax, 24(%rsi)
; X64-AVX-NEXT:    vmovq %xmm1, %rax
; X64-AVX-NEXT:    movntiq %rax, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 1
  %4 = load <4 x float>, <4 x float>* %2, align 1
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 1, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 1, !nontemporal !0
  ret void
}

; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads and scalarize NT-stores.
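; (This combines the two cases above: the unaligned loads become regular
; MOVUPS/MOVDQU and the NT stores are scalarized to MOVNTI/MOVNTSD.)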
define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-SSE2-LABEL: merge_2_v4f32_align1:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, (%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X86-SSE2-NEXT:    movd %xmm2, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm2, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
; X86-SSE2-NEXT:    movd %xmm1, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
; X86-SSE2-NEXT:    retl
;
; X86-SSE4A-LABEL: merge_2_v4f32_align1:
; X86-SSE4A:       # %bb.0:
; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT:    movups (%ecx), %xmm0
; X86-SSE4A-NEXT:    movups 16(%ecx), %xmm1
; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; X86-SSE4A-NEXT:    movntsd %xmm2, 8(%eax)
; X86-SSE4A-NEXT:    movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT:    movntsd %xmm3, 24(%eax)
; X86-SSE4A-NEXT:    movntsd %xmm1, 16(%eax)
; X86-SSE4A-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, (%rsi)
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT:    movq %xmm1, %rax
; X64-SSE2-NEXT:    movntiq %rax, 16(%rsi)
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movups (%rdi), %xmm0
; X64-SSE4A-NEXT:    movups 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; X64-SSE4A-NEXT:    movntsd %xmm2, 8(%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm3, 24(%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
; X64-SSE41-NEXT:    pextrq $1, %xmm0, %rax
; X64-SSE41-NEXT:    movntiq %rax, 8(%rsi)
; X64-SSE41-NEXT:    movq %xmm0, %rax
; X64-SSE41-NEXT:    movntiq %rax, (%rsi)
; X64-SSE41-NEXT:    pextrq $1, %xmm1, %rax
; X64-SSE41-NEXT:    movntiq %rax, 24(%rsi)
; X64-SSE41-NEXT:    movq %xmm1, %rax
; X64-SSE41-NEXT:    movntiq %rax, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
; X64-AVX-NEXT:    vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT:    movntiq %rax, 8(%rsi)
; X64-AVX-NEXT:    vmovq %xmm0, %rax
; X64-AVX-NEXT:    movntiq %rax, (%rsi)
; X64-AVX-NEXT:    vpextrq $1, %xmm1, %rax
; X64-AVX-NEXT:    movntiq %rax, 24(%rsi)
; X64-AVX-NEXT:    vmovq %xmm1, %rax
; X64-AVX-NEXT:    movntiq %rax, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 1, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 1, !nontemporal !0
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 1, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 1, !nontemporal !0
  ret void
}

!0 = !{i32 1}