1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=ALL32,SSE32
3; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=ALL64,SSE64
4; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=ALL32,AVX32,AVXONLY32
5; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=ALL64,AVX64,AVXONLY64
6; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=ALL32,AVX32,AVX51232
7; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=ALL64,AVX64,AVX51264
8; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=ALL32,AVX32,AVX51232
9; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=ALL64,AVX64,AVX51264
10
; Stores the i32 argument %value to %addr with align 1 and returns %value.
11define i32 @test_store_32(i32* nocapture %addr, i32 %value) {
12; ALL32-LABEL: test_store_32:
13; ALL32:       # %bb.0: # %entry
14; ALL32-NEXT:    movl %esi, %eax
15; ALL32-NEXT:    movl %esi, (%rdi)
16; ALL32-NEXT:    retq
17;
18; ALL64-LABEL: test_store_32:
19; ALL64:       # %bb.0: # %entry
20; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %eax
21; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
22; ALL64-NEXT:    movl %eax, (%ecx)
23; ALL64-NEXT:    retl
24entry:
25  store i32 %value, i32* %addr, align 1
26  ret i32 %value
27}
28
; Stores the i16 argument %value to %addr with align 1 and returns %value.
; The checks expect a 16-bit movw for the store on both targets.
29define i16 @test_store_16(i16* nocapture %addr, i16 %value) {
30; ALL32-LABEL: test_store_16:
31; ALL32:       # %bb.0: # %entry
32; ALL32-NEXT:    movl %esi, %eax
33; ALL32-NEXT:    movw %ax, (%rdi)
34; ALL32-NEXT:    # kill: def $ax killed $ax killed $eax
35; ALL32-NEXT:    retq
36;
37; ALL64-LABEL: test_store_16:
38; ALL64:       # %bb.0: # %entry
39; ALL64-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
40; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
41; ALL64-NEXT:    movw %ax, (%ecx)
42; ALL64-NEXT:    retl
43entry:
44  store i16 %value, i16* %addr, align 1
45  ret i16 %value
46}
47
; Adds the two <4 x i32> arguments, stores the sum to %addr with align 1 and
; returns the sum. The checks expect an unaligned integer move (movdqu/vmovdqu).
48define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
49; SSE32-LABEL: test_store_4xi32:
50; SSE32:       # %bb.0:
51; SSE32-NEXT:    paddd %xmm1, %xmm0
52; SSE32-NEXT:    movdqu %xmm0, (%rdi)
53; SSE32-NEXT:    retq
54;
55; SSE64-LABEL: test_store_4xi32:
56; SSE64:       # %bb.0:
57; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
58; SSE64-NEXT:    paddd %xmm1, %xmm0
59; SSE64-NEXT:    movdqu %xmm0, (%eax)
60; SSE64-NEXT:    retl
61;
62; AVX32-LABEL: test_store_4xi32:
63; AVX32:       # %bb.0:
64; AVX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
65; AVX32-NEXT:    vmovdqu %xmm0, (%rdi)
66; AVX32-NEXT:    retq
67;
68; AVX64-LABEL: test_store_4xi32:
69; AVX64:       # %bb.0:
70; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
71; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
72; AVX64-NEXT:    vmovdqu %xmm0, (%eax)
73; AVX64-NEXT:    retl
74  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
75  store <4 x i32> %foo, <4 x i32>* %addr, align 1
76  ret <4 x i32> %foo
77}
78
; Adds the two <4 x i32> arguments, stores the sum to %addr with align 16 and
; returns the sum. The checks expect an aligned integer move (movdqa/vmovdqa).
79define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
80; SSE32-LABEL: test_store_4xi32_aligned:
81; SSE32:       # %bb.0:
82; SSE32-NEXT:    paddd %xmm1, %xmm0
83; SSE32-NEXT:    movdqa %xmm0, (%rdi)
84; SSE32-NEXT:    retq
85;
86; SSE64-LABEL: test_store_4xi32_aligned:
87; SSE64:       # %bb.0:
88; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
89; SSE64-NEXT:    paddd %xmm1, %xmm0
90; SSE64-NEXT:    movdqa %xmm0, (%eax)
91; SSE64-NEXT:    retl
92;
93; AVX32-LABEL: test_store_4xi32_aligned:
94; AVX32:       # %bb.0:
95; AVX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
96; AVX32-NEXT:    vmovdqa %xmm0, (%rdi)
97; AVX32-NEXT:    retq
98;
99; AVX64-LABEL: test_store_4xi32_aligned:
100; AVX64:       # %bb.0:
101; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
102; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
103; AVX64-NEXT:    vmovdqa %xmm0, (%eax)
104; AVX64-NEXT:    retl
105  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
106  store <4 x i32> %foo, <4 x i32>* %addr, align 16
107  ret <4 x i32> %foo
108}
109
; Stores the <4 x float> argument %value to %addr with align 1 and returns
; %value. The checks expect an unaligned move (movups/vmovups).
110define <4 x float> @test_store_4xf32(<4 x float>* nocapture %addr, <4 x float> %value) {
111; SSE32-LABEL: test_store_4xf32:
112; SSE32:       # %bb.0:
113; SSE32-NEXT:    movups %xmm0, (%rdi)
114; SSE32-NEXT:    retq
115;
116; SSE64-LABEL: test_store_4xf32:
117; SSE64:       # %bb.0:
118; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
119; SSE64-NEXT:    movups %xmm0, (%eax)
120; SSE64-NEXT:    retl
121;
122; AVX32-LABEL: test_store_4xf32:
123; AVX32:       # %bb.0:
124; AVX32-NEXT:    vmovups %xmm0, (%rdi)
125; AVX32-NEXT:    retq
126;
127; AVX64-LABEL: test_store_4xf32:
128; AVX64:       # %bb.0:
129; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
130; AVX64-NEXT:    vmovups %xmm0, (%eax)
131; AVX64-NEXT:    retl
132  store <4 x float> %value, <4 x float>* %addr, align 1
133  ret <4 x float> %value
134}
135
; Stores the <4 x float> argument %value to %addr with align 16 and returns
; %value. The checks expect an aligned move (movaps/vmovaps).
136define <4 x float> @test_store_4xf32_aligned(<4 x float>* nocapture %addr, <4 x float> %value) {
137; SSE32-LABEL: test_store_4xf32_aligned:
138; SSE32:       # %bb.0:
139; SSE32-NEXT:    movaps %xmm0, (%rdi)
140; SSE32-NEXT:    retq
141;
142; SSE64-LABEL: test_store_4xf32_aligned:
143; SSE64:       # %bb.0:
144; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
145; SSE64-NEXT:    movaps %xmm0, (%eax)
146; SSE64-NEXT:    retl
147;
148; AVX32-LABEL: test_store_4xf32_aligned:
149; AVX32:       # %bb.0:
150; AVX32-NEXT:    vmovaps %xmm0, (%rdi)
151; AVX32-NEXT:    retq
152;
153; AVX64-LABEL: test_store_4xf32_aligned:
154; AVX64:       # %bb.0:
155; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
156; AVX64-NEXT:    vmovaps %xmm0, (%eax)
157; AVX64-NEXT:    retl
158  store <4 x float> %value, <4 x float>* %addr, align 16
159  ret <4 x float> %value
160}
161
; Adds the two <2 x double> arguments, stores the sum to %addr with align 1 and
; returns the sum. The checks expect an unaligned double move (movupd/vmovupd).
162define <2 x double> @test_store_2xf64(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
163; SSE32-LABEL: test_store_2xf64:
164; SSE32:       # %bb.0:
165; SSE32-NEXT:    addpd %xmm1, %xmm0
166; SSE32-NEXT:    movupd %xmm0, (%rdi)
167; SSE32-NEXT:    retq
168;
169; SSE64-LABEL: test_store_2xf64:
170; SSE64:       # %bb.0:
171; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
172; SSE64-NEXT:    addpd %xmm1, %xmm0
173; SSE64-NEXT:    movupd %xmm0, (%eax)
174; SSE64-NEXT:    retl
175;
176; AVX32-LABEL: test_store_2xf64:
177; AVX32:       # %bb.0:
178; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
179; AVX32-NEXT:    vmovupd %xmm0, (%rdi)
180; AVX32-NEXT:    retq
181;
182; AVX64-LABEL: test_store_2xf64:
183; AVX64:       # %bb.0:
184; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
185; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
186; AVX64-NEXT:    vmovupd %xmm0, (%eax)
187; AVX64-NEXT:    retl
188  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
189  store <2 x double> %foo, <2 x double>* %addr, align 1
190  ret <2 x double> %foo
191}
192
; Adds the two <2 x double> arguments, stores the sum to %addr with align 16 and
; returns the sum. The checks expect an aligned double move (movapd/vmovapd).
193define <2 x double> @test_store_2xf64_aligned(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
194; SSE32-LABEL: test_store_2xf64_aligned:
195; SSE32:       # %bb.0:
196; SSE32-NEXT:    addpd %xmm1, %xmm0
197; SSE32-NEXT:    movapd %xmm0, (%rdi)
198; SSE32-NEXT:    retq
199;
200; SSE64-LABEL: test_store_2xf64_aligned:
201; SSE64:       # %bb.0:
202; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
203; SSE64-NEXT:    addpd %xmm1, %xmm0
204; SSE64-NEXT:    movapd %xmm0, (%eax)
205; SSE64-NEXT:    retl
206;
207; AVX32-LABEL: test_store_2xf64_aligned:
208; AVX32:       # %bb.0:
209; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
210; AVX32-NEXT:    vmovapd %xmm0, (%rdi)
211; AVX32-NEXT:    retq
212;
213; AVX64-LABEL: test_store_2xf64_aligned:
214; AVX64:       # %bb.0:
215; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
216; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
217; AVX64-NEXT:    vmovapd %xmm0, (%eax)
218; AVX64-NEXT:    retl
219  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
220  store <2 x double> %foo, <2 x double>* %addr, align 16
221  ret <2 x double> %foo
222}
223
; Stores the <8 x i32> argument %value to %addr with align 1 and returns %value.
; No arithmetic precedes the store; the checks expect movups/vmovups, as two
; 16-byte stores with SSE2 and one ymm store with AVX.
224define <8 x i32> @test_store_8xi32(<8 x i32>* nocapture %addr, <8 x i32> %value) {
225; SSE32-LABEL: test_store_8xi32:
226; SSE32:       # %bb.0:
227; SSE32-NEXT:    movups %xmm0, (%rdi)
228; SSE32-NEXT:    movups %xmm1, 16(%rdi)
229; SSE32-NEXT:    retq
230;
231; SSE64-LABEL: test_store_8xi32:
232; SSE64:       # %bb.0:
233; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
234; SSE64-NEXT:    movups %xmm0, (%eax)
235; SSE64-NEXT:    movups %xmm1, 16(%eax)
236; SSE64-NEXT:    retl
237;
238; AVX32-LABEL: test_store_8xi32:
239; AVX32:       # %bb.0:
240; AVX32-NEXT:    vmovups %ymm0, (%rdi)
241; AVX32-NEXT:    retq
242;
243; AVX64-LABEL: test_store_8xi32:
244; AVX64:       # %bb.0:
245; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
246; AVX64-NEXT:    vmovups %ymm0, (%eax)
247; AVX64-NEXT:    retl
248  store <8 x i32> %value, <8 x i32>* %addr, align 1
249  ret <8 x i32> %value
250}
251
; Stores the <8 x i32> argument %value to %addr with align 32 and returns
; %value. The checks expect movaps/vmovaps, as two 16-byte stores with SSE2 and
; one ymm store with AVX.
252define <8 x i32> @test_store_8xi32_aligned(<8 x i32>* nocapture %addr, <8 x i32> %value) {
253; SSE32-LABEL: test_store_8xi32_aligned:
254; SSE32:       # %bb.0:
255; SSE32-NEXT:    movaps %xmm0, (%rdi)
256; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
257; SSE32-NEXT:    retq
258;
259; SSE64-LABEL: test_store_8xi32_aligned:
260; SSE64:       # %bb.0:
261; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
262; SSE64-NEXT:    movaps %xmm0, (%eax)
263; SSE64-NEXT:    movaps %xmm1, 16(%eax)
264; SSE64-NEXT:    retl
265;
266; AVX32-LABEL: test_store_8xi32_aligned:
267; AVX32:       # %bb.0:
268; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
269; AVX32-NEXT:    retq
270;
271; AVX64-LABEL: test_store_8xi32_aligned:
272; AVX64:       # %bb.0:
273; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
274; AVX64-NEXT:    vmovaps %ymm0, (%eax)
275; AVX64-NEXT:    retl
276  store <8 x i32> %value, <8 x i32>* %addr, align 32
277  ret <8 x i32> %value
278}
279
; Stores the <8 x float> argument %value to %addr with align 1 and returns
; %value. The checks expect movups/vmovups, as two 16-byte stores with SSE2 and
; one ymm store with AVX.
280define <8 x float> @test_store_8xf32(<8 x float>* nocapture %addr, <8 x float> %value) {
281; SSE32-LABEL: test_store_8xf32:
282; SSE32:       # %bb.0:
283; SSE32-NEXT:    movups %xmm0, (%rdi)
284; SSE32-NEXT:    movups %xmm1, 16(%rdi)
285; SSE32-NEXT:    retq
286;
287; SSE64-LABEL: test_store_8xf32:
288; SSE64:       # %bb.0:
289; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
290; SSE64-NEXT:    movups %xmm0, (%eax)
291; SSE64-NEXT:    movups %xmm1, 16(%eax)
292; SSE64-NEXT:    retl
293;
294; AVX32-LABEL: test_store_8xf32:
295; AVX32:       # %bb.0:
296; AVX32-NEXT:    vmovups %ymm0, (%rdi)
297; AVX32-NEXT:    retq
298;
299; AVX64-LABEL: test_store_8xf32:
300; AVX64:       # %bb.0:
301; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
302; AVX64-NEXT:    vmovups %ymm0, (%eax)
303; AVX64-NEXT:    retl
304  store <8 x float> %value, <8 x float>* %addr, align 1
305  ret <8 x float> %value
306}
307
; Stores the <8 x float> argument %value to %addr with align 32 and returns
; %value. The checks expect movaps/vmovaps, as two 16-byte stores with SSE2 and
; one ymm store with AVX.
308define <8 x float> @test_store_8xf32_aligned(<8 x float>* nocapture %addr, <8 x float> %value) {
309; SSE32-LABEL: test_store_8xf32_aligned:
310; SSE32:       # %bb.0:
311; SSE32-NEXT:    movaps %xmm0, (%rdi)
312; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
313; SSE32-NEXT:    retq
314;
315; SSE64-LABEL: test_store_8xf32_aligned:
316; SSE64:       # %bb.0:
317; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
318; SSE64-NEXT:    movaps %xmm0, (%eax)
319; SSE64-NEXT:    movaps %xmm1, 16(%eax)
320; SSE64-NEXT:    retl
321;
322; AVX32-LABEL: test_store_8xf32_aligned:
323; AVX32:       # %bb.0:
324; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
325; AVX32-NEXT:    retq
326;
327; AVX64-LABEL: test_store_8xf32_aligned:
328; AVX64:       # %bb.0:
329; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
330; AVX64-NEXT:    vmovaps %ymm0, (%eax)
331; AVX64-NEXT:    retl
332  store <8 x float> %value, <8 x float>* %addr, align 32
333  ret <8 x float> %value
334}
335
; Adds the two <4 x double> arguments, stores the sum to %addr with align 1 and
; returns the sum. The checks expect movupd/vmovupd; with SSE2 the add and store
; are done per 16-byte half, and on i686 one operand half is loaded from the stack.
336define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
337; SSE32-LABEL: test_store_4xf64:
338; SSE32:       # %bb.0:
339; SSE32-NEXT:    addpd %xmm2, %xmm0
340; SSE32-NEXT:    movupd %xmm0, (%rdi)
341; SSE32-NEXT:    addpd %xmm3, %xmm1
342; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
343; SSE32-NEXT:    retq
344;
345; SSE64-LABEL: test_store_4xf64:
346; SSE64:       # %bb.0:
347; SSE64-NEXT:    subl $12, %esp
348; SSE64-NEXT:    .cfi_def_cfa_offset 16
349; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
350; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
351; SSE64-NEXT:    addpd %xmm2, %xmm0
352; SSE64-NEXT:    movupd %xmm0, (%eax)
353; SSE64-NEXT:    addpd %xmm3, %xmm1
354; SSE64-NEXT:    movupd %xmm1, 16(%eax)
355; SSE64-NEXT:    addl $12, %esp
356; SSE64-NEXT:    .cfi_def_cfa_offset 4
357; SSE64-NEXT:    retl
358;
359; AVX32-LABEL: test_store_4xf64:
360; AVX32:       # %bb.0:
361; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
362; AVX32-NEXT:    vmovupd %ymm0, (%rdi)
363; AVX32-NEXT:    retq
364;
365; AVX64-LABEL: test_store_4xf64:
366; AVX64:       # %bb.0:
367; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
368; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
369; AVX64-NEXT:    vmovupd %ymm0, (%eax)
370; AVX64-NEXT:    retl
371  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
372  store <4 x double> %foo, <4 x double>* %addr, align 1
373  ret <4 x double> %foo
374}
375
; Adds the two <4 x double> arguments, stores the sum to %addr with align 32 and
; returns the sum. The checks expect movapd/vmovapd; with SSE2 the add and store
; are done per 16-byte half, and on i686 one operand half is loaded from the stack.
376define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
377; SSE32-LABEL: test_store_4xf64_aligned:
378; SSE32:       # %bb.0:
379; SSE32-NEXT:    addpd %xmm2, %xmm0
380; SSE32-NEXT:    movapd %xmm0, (%rdi)
381; SSE32-NEXT:    addpd %xmm3, %xmm1
382; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
383; SSE32-NEXT:    retq
384;
385; SSE64-LABEL: test_store_4xf64_aligned:
386; SSE64:       # %bb.0:
387; SSE64-NEXT:    subl $12, %esp
388; SSE64-NEXT:    .cfi_def_cfa_offset 16
389; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
390; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
391; SSE64-NEXT:    addpd %xmm2, %xmm0
392; SSE64-NEXT:    movapd %xmm0, (%eax)
393; SSE64-NEXT:    addpd %xmm3, %xmm1
394; SSE64-NEXT:    movapd %xmm1, 16(%eax)
395; SSE64-NEXT:    addl $12, %esp
396; SSE64-NEXT:    .cfi_def_cfa_offset 4
397; SSE64-NEXT:    retl
398;
399; AVX32-LABEL: test_store_4xf64_aligned:
400; AVX32:       # %bb.0:
401; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
402; AVX32-NEXT:    vmovapd %ymm0, (%rdi)
403; AVX32-NEXT:    retq
404;
405; AVX64-LABEL: test_store_4xf64_aligned:
406; AVX64:       # %bb.0:
407; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
408; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
409; AVX64-NEXT:    vmovapd %ymm0, (%eax)
410; AVX64-NEXT:    retl
411  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
412  store <4 x double> %foo, <4 x double>* %addr, align 32
413  ret <4 x double> %foo
414}
415
; Stores the <16 x i32> argument %value to %addr with align 1 and returns
; %value. The checks expect movups/vmovups: four xmm stores with SSE2, two ymm
; stores with AVX only, and one zmm store with AVX-512.
416define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %value) {
417; SSE32-LABEL: test_store_16xi32:
418; SSE32:       # %bb.0:
419; SSE32-NEXT:    movups %xmm0, (%rdi)
420; SSE32-NEXT:    movups %xmm1, 16(%rdi)
421; SSE32-NEXT:    movups %xmm2, 32(%rdi)
422; SSE32-NEXT:    movups %xmm3, 48(%rdi)
423; SSE32-NEXT:    retq
424;
425; SSE64-LABEL: test_store_16xi32:
426; SSE64:       # %bb.0:
427; SSE64-NEXT:    subl $12, %esp
428; SSE64-NEXT:    .cfi_def_cfa_offset 16
429; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
430; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
431; SSE64-NEXT:    movups %xmm0, (%eax)
432; SSE64-NEXT:    movups %xmm1, 16(%eax)
433; SSE64-NEXT:    movups %xmm2, 32(%eax)
434; SSE64-NEXT:    movups %xmm3, 48(%eax)
435; SSE64-NEXT:    addl $12, %esp
436; SSE64-NEXT:    .cfi_def_cfa_offset 4
437; SSE64-NEXT:    retl
438;
439; AVXONLY32-LABEL: test_store_16xi32:
440; AVXONLY32:       # %bb.0:
441; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
442; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
443; AVXONLY32-NEXT:    retq
444;
445; AVXONLY64-LABEL: test_store_16xi32:
446; AVXONLY64:       # %bb.0:
447; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
448; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
449; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
450; AVXONLY64-NEXT:    retl
451;
452; AVX51232-LABEL: test_store_16xi32:
453; AVX51232:       # %bb.0:
454; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
455; AVX51232-NEXT:    retq
456;
457; AVX51264-LABEL: test_store_16xi32:
458; AVX51264:       # %bb.0:
459; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
460; AVX51264-NEXT:    vmovups %zmm0, (%eax)
461; AVX51264-NEXT:    retl
462  store <16 x i32> %value, <16 x i32>* %addr, align 1
463  ret <16 x i32> %value
464}
465
; Stores the <16 x i32> argument %value to %addr with align 64 and returns
; %value. The checks expect movaps/vmovaps: four xmm stores with SSE2, two ymm
; stores with AVX only, and one zmm store with AVX-512.
466define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x i32> %value) {
467; SSE32-LABEL: test_store_16xi32_aligned:
468; SSE32:       # %bb.0:
469; SSE32-NEXT:    movaps %xmm0, (%rdi)
470; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
471; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
472; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
473; SSE32-NEXT:    retq
474;
475; SSE64-LABEL: test_store_16xi32_aligned:
476; SSE64:       # %bb.0:
477; SSE64-NEXT:    subl $12, %esp
478; SSE64-NEXT:    .cfi_def_cfa_offset 16
479; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
480; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
481; SSE64-NEXT:    movaps %xmm0, (%eax)
482; SSE64-NEXT:    movaps %xmm1, 16(%eax)
483; SSE64-NEXT:    movaps %xmm2, 32(%eax)
484; SSE64-NEXT:    movaps %xmm3, 48(%eax)
485; SSE64-NEXT:    addl $12, %esp
486; SSE64-NEXT:    .cfi_def_cfa_offset 4
487; SSE64-NEXT:    retl
488;
489; AVXONLY32-LABEL: test_store_16xi32_aligned:
490; AVXONLY32:       # %bb.0:
491; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
492; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
493; AVXONLY32-NEXT:    retq
494;
495; AVXONLY64-LABEL: test_store_16xi32_aligned:
496; AVXONLY64:       # %bb.0:
497; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
498; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
499; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
500; AVXONLY64-NEXT:    retl
501;
502; AVX51232-LABEL: test_store_16xi32_aligned:
503; AVX51232:       # %bb.0:
504; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
505; AVX51232-NEXT:    retq
506;
507; AVX51264-LABEL: test_store_16xi32_aligned:
508; AVX51264:       # %bb.0:
509; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
510; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
511; AVX51264-NEXT:    retl
512  store <16 x i32> %value, <16 x i32>* %addr, align 64
513  ret <16 x i32> %value
514}
515
; Stores the <16 x float> argument %value to %addr with align 1 and returns
; %value. The checks expect movups/vmovups: four xmm stores with SSE2, two ymm
; stores with AVX only, and one zmm store with AVX-512.
516define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x float> %value) {
517; SSE32-LABEL: test_store_16xf32:
518; SSE32:       # %bb.0:
519; SSE32-NEXT:    movups %xmm0, (%rdi)
520; SSE32-NEXT:    movups %xmm1, 16(%rdi)
521; SSE32-NEXT:    movups %xmm2, 32(%rdi)
522; SSE32-NEXT:    movups %xmm3, 48(%rdi)
523; SSE32-NEXT:    retq
524;
525; SSE64-LABEL: test_store_16xf32:
526; SSE64:       # %bb.0:
527; SSE64-NEXT:    subl $12, %esp
528; SSE64-NEXT:    .cfi_def_cfa_offset 16
529; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
530; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
531; SSE64-NEXT:    movups %xmm0, (%eax)
532; SSE64-NEXT:    movups %xmm1, 16(%eax)
533; SSE64-NEXT:    movups %xmm2, 32(%eax)
534; SSE64-NEXT:    movups %xmm3, 48(%eax)
535; SSE64-NEXT:    addl $12, %esp
536; SSE64-NEXT:    .cfi_def_cfa_offset 4
537; SSE64-NEXT:    retl
538;
539; AVXONLY32-LABEL: test_store_16xf32:
540; AVXONLY32:       # %bb.0:
541; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
542; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
543; AVXONLY32-NEXT:    retq
544;
545; AVXONLY64-LABEL: test_store_16xf32:
546; AVXONLY64:       # %bb.0:
547; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
548; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
549; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
550; AVXONLY64-NEXT:    retl
551;
552; AVX51232-LABEL: test_store_16xf32:
553; AVX51232:       # %bb.0:
554; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
555; AVX51232-NEXT:    retq
556;
557; AVX51264-LABEL: test_store_16xf32:
558; AVX51264:       # %bb.0:
559; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
560; AVX51264-NEXT:    vmovups %zmm0, (%eax)
561; AVX51264-NEXT:    retl
562  store <16 x float> %value, <16 x float>* %addr, align 1
563  ret <16 x float> %value
564}
565
; Stores the <16 x float> argument %value to %addr with align 64 and returns
; %value. The checks expect movaps/vmovaps: four xmm stores with SSE2, two ymm
; stores with AVX only, and one zmm store with AVX-512.
566define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <16 x float> %value) {
567; SSE32-LABEL: test_store_16xf32_aligned:
568; SSE32:       # %bb.0:
569; SSE32-NEXT:    movaps %xmm0, (%rdi)
570; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
571; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
572; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
573; SSE32-NEXT:    retq
574;
575; SSE64-LABEL: test_store_16xf32_aligned:
576; SSE64:       # %bb.0:
577; SSE64-NEXT:    subl $12, %esp
578; SSE64-NEXT:    .cfi_def_cfa_offset 16
579; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
580; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
581; SSE64-NEXT:    movaps %xmm0, (%eax)
582; SSE64-NEXT:    movaps %xmm1, 16(%eax)
583; SSE64-NEXT:    movaps %xmm2, 32(%eax)
584; SSE64-NEXT:    movaps %xmm3, 48(%eax)
585; SSE64-NEXT:    addl $12, %esp
586; SSE64-NEXT:    .cfi_def_cfa_offset 4
587; SSE64-NEXT:    retl
588;
589; AVXONLY32-LABEL: test_store_16xf32_aligned:
590; AVXONLY32:       # %bb.0:
591; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
592; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
593; AVXONLY32-NEXT:    retq
594;
595; AVXONLY64-LABEL: test_store_16xf32_aligned:
596; AVXONLY64:       # %bb.0:
597; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
598; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
599; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
600; AVXONLY64-NEXT:    retl
601;
602; AVX51232-LABEL: test_store_16xf32_aligned:
603; AVX51232:       # %bb.0:
604; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
605; AVX51232-NEXT:    retq
606;
607; AVX51264-LABEL: test_store_16xf32_aligned:
608; AVX51264:       # %bb.0:
609; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
610; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
611; AVX51264-NEXT:    retl
612  store <16 x float> %value, <16 x float>* %addr, align 64
613  ret <16 x float> %value
614}
615
; Adds the two <8 x double> arguments, stores the sum to %addr with align 1 and
; returns the sum. The checks expect movupd/vmovupd stores. On i686 the expected
; code loads part of the operands from the stack, and the AVX-only i686 variant
; sets up an %ebp frame and realigns %esp to 32 bytes.
616define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
617; SSE32-LABEL: test_store_8xf64:
618; SSE32:       # %bb.0:
619; SSE32-NEXT:    addpd %xmm4, %xmm0
620; SSE32-NEXT:    movupd %xmm0, (%rdi)
621; SSE32-NEXT:    addpd %xmm5, %xmm1
622; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
623; SSE32-NEXT:    addpd %xmm6, %xmm2
624; SSE32-NEXT:    movupd %xmm2, 32(%rdi)
625; SSE32-NEXT:    addpd %xmm7, %xmm3
626; SSE32-NEXT:    movupd %xmm3, 48(%rdi)
627; SSE32-NEXT:    retq
628;
629; SSE64-LABEL: test_store_8xf64:
630; SSE64:       # %bb.0:
631; SSE64-NEXT:    subl $12, %esp
632; SSE64-NEXT:    .cfi_def_cfa_offset 16
633; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm4
634; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm5
635; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm6
636; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
637; SSE64-NEXT:    addpd %xmm4, %xmm3
638; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
639; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
640; SSE64-NEXT:    movupd %xmm0, (%eax)
641; SSE64-NEXT:    addpd %xmm6, %xmm1
642; SSE64-NEXT:    movupd %xmm1, 16(%eax)
643; SSE64-NEXT:    addpd %xmm5, %xmm2
644; SSE64-NEXT:    movupd %xmm2, 32(%eax)
645; SSE64-NEXT:    movupd %xmm3, 48(%eax)
646; SSE64-NEXT:    addl $12, %esp
647; SSE64-NEXT:    .cfi_def_cfa_offset 4
648; SSE64-NEXT:    retl
649;
650; AVXONLY32-LABEL: test_store_8xf64:
651; AVXONLY32:       # %bb.0:
652; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
653; AVXONLY32-NEXT:    vmovupd %ymm0, (%rdi)
654; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
655; AVXONLY32-NEXT:    vmovupd %ymm1, 32(%rdi)
656; AVXONLY32-NEXT:    retq
657;
658; AVXONLY64-LABEL: test_store_8xf64:
659; AVXONLY64:       # %bb.0:
660; AVXONLY64-NEXT:    pushl %ebp
661; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
662; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
663; AVXONLY64-NEXT:    movl %esp, %ebp
664; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
665; AVXONLY64-NEXT:    andl $-32, %esp
666; AVXONLY64-NEXT:    subl $32, %esp
667; AVXONLY64-NEXT:    vmovapd 40(%ebp), %ymm3
668; AVXONLY64-NEXT:    movl 8(%ebp), %eax
669; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
670; AVXONLY64-NEXT:    vmovupd %ymm0, (%eax)
671; AVXONLY64-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
672; AVXONLY64-NEXT:    vmovupd %ymm1, 32(%eax)
673; AVXONLY64-NEXT:    movl %ebp, %esp
674; AVXONLY64-NEXT:    popl %ebp
675; AVXONLY64-NEXT:    .cfi_def_cfa %esp, 4
676; AVXONLY64-NEXT:    retl
677;
678; AVX51232-LABEL: test_store_8xf64:
679; AVX51232:       # %bb.0:
680; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
681; AVX51232-NEXT:    vmovupd %zmm0, (%rdi)
682; AVX51232-NEXT:    retq
683;
684; AVX51264-LABEL: test_store_8xf64:
685; AVX51264:       # %bb.0:
686; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
687; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
688; AVX51264-NEXT:    vmovupd %zmm0, (%eax)
689; AVX51264-NEXT:    retl
690  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
691  store <8 x double> %foo, <8 x double>* %addr, align 1
692  ret <8 x double> %foo
693}
694
; Adds the two <8 x double> arguments, stores the sum to %addr with align 64 and
; returns the sum. The checks expect movapd/vmovapd stores. On i686 the expected
; code loads part of the operands from the stack, and the AVX-only i686 variant
; sets up an %ebp frame and realigns %esp to 32 bytes.
695define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
696; SSE32-LABEL: test_store_8xf64_aligned:
697; SSE32:       # %bb.0:
698; SSE32-NEXT:    addpd %xmm4, %xmm0
699; SSE32-NEXT:    movapd %xmm0, (%rdi)
700; SSE32-NEXT:    addpd %xmm5, %xmm1
701; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
702; SSE32-NEXT:    addpd %xmm6, %xmm2
703; SSE32-NEXT:    movapd %xmm2, 32(%rdi)
704; SSE32-NEXT:    addpd %xmm7, %xmm3
705; SSE32-NEXT:    movapd %xmm3, 48(%rdi)
706; SSE32-NEXT:    retq
707;
708; SSE64-LABEL: test_store_8xf64_aligned:
709; SSE64:       # %bb.0:
710; SSE64-NEXT:    subl $12, %esp
711; SSE64-NEXT:    .cfi_def_cfa_offset 16
712; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm4
713; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm5
714; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm6
715; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
716; SSE64-NEXT:    addpd %xmm4, %xmm3
717; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
718; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
719; SSE64-NEXT:    movapd %xmm0, (%eax)
720; SSE64-NEXT:    addpd %xmm6, %xmm1
721; SSE64-NEXT:    movapd %xmm1, 16(%eax)
722; SSE64-NEXT:    addpd %xmm5, %xmm2
723; SSE64-NEXT:    movapd %xmm2, 32(%eax)
724; SSE64-NEXT:    movapd %xmm3, 48(%eax)
725; SSE64-NEXT:    addl $12, %esp
726; SSE64-NEXT:    .cfi_def_cfa_offset 4
727; SSE64-NEXT:    retl
728;
729; AVXONLY32-LABEL: test_store_8xf64_aligned:
730; AVXONLY32:       # %bb.0:
731; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
732; AVXONLY32-NEXT:    vmovapd %ymm0, (%rdi)
733; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
734; AVXONLY32-NEXT:    vmovapd %ymm1, 32(%rdi)
735; AVXONLY32-NEXT:    retq
736;
737; AVXONLY64-LABEL: test_store_8xf64_aligned:
738; AVXONLY64:       # %bb.0:
739; AVXONLY64-NEXT:    pushl %ebp
740; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
741; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
742; AVXONLY64-NEXT:    movl %esp, %ebp
743; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
744; AVXONLY64-NEXT:    andl $-32, %esp
745; AVXONLY64-NEXT:    subl $32, %esp
746; AVXONLY64-NEXT:    vmovapd 40(%ebp), %ymm3
747; AVXONLY64-NEXT:    movl 8(%ebp), %eax
748; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
749; AVXONLY64-NEXT:    vmovapd %ymm0, (%eax)
750; AVXONLY64-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
751; AVXONLY64-NEXT:    vmovapd %ymm1, 32(%eax)
752; AVXONLY64-NEXT:    movl %ebp, %esp
753; AVXONLY64-NEXT:    popl %ebp
754; AVXONLY64-NEXT:    .cfi_def_cfa %esp, 4
755; AVXONLY64-NEXT:    retl
756;
757; AVX51232-LABEL: test_store_8xf64_aligned:
758; AVX51232:       # %bb.0:
759; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
760; AVX51232-NEXT:    vmovapd %zmm0, (%rdi)
761; AVX51232-NEXT:    retq
762;
763; AVX51264-LABEL: test_store_8xf64_aligned:
764; AVX51264:       # %bb.0:
765; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
766; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
767; AVX51264-NEXT:    vmovapd %zmm0, (%eax)
768; AVX51264-NEXT:    retl
769  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
770  store <8 x double> %foo, <8 x double>* %addr, align 64
771  ret <8 x double> %foo
772}
773