1; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
2; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
3; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
4; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK
5
6; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
7; because that is slower than two 16-byte loads.
8; Other AVX-capable chips don't have that problem.
9
10define <8 x float> @load32bytes(<8 x float>* %Ap) {
  ; Loads a full <8 x float> (32 bytes) from %Ap with only 16-byte alignment
  ; and returns it unchanged.
11  ; CHECK-LABEL: load32bytes
12
  ; Runs using the SANDYB prefix (-mcpu=corei7-avx and -mcpu=core-avx-i)
  ; expect a vmovaps and a vinsertf128 before the return.
13  ; SANDYB: vmovaps
14  ; SANDYB: vinsertf128
15  ; SANDYB: retq
16
  ; The btver2 run expects a vmovups before the return.
17  ; BTVER2: vmovups
18  ; BTVER2: retq
19
  ; The core-avx2 run expects a vmovups before the return.
20  ; HASWELL: vmovups
21  ; HASWELL: retq
22
  ; The load under test: a 256-bit vector type, but 'align 16' rather than 32.
23  %A = load <8 x float>, <8 x float>* %Ap, align 16
24  ret <8 x float> %A
25}
26
27; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
28; because that is slower than two 16-byte stores.
29; Other AVX-capable chips don't have that problem.
30
31define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
  ; Stores the <8 x float> argument %A (32 bytes) to %P with only 16-byte
  ; alignment.
32  ; CHECK-LABEL: store32bytes
33
  ; Runs using the SANDYB prefix (-mcpu=corei7-avx and -mcpu=core-avx-i)
  ; expect a vextractf128 and a vmovaps before the return.
34  ; SANDYB: vextractf128
35  ; SANDYB: vmovaps
36  ; SANDYB: retq
37
  ; The btver2 run expects a vmovups before the return.
38  ; BTVER2: vmovups
39  ; BTVER2: retq
40
  ; The core-avx2 run expects a vmovups before the return.
41  ; HASWELL: vmovups
42  ; HASWELL: retq
43
  ; The store under test: a 256-bit vector type, but 'align 16' rather than 32.
44  store <8 x float> %A, <8 x float>* %P, align 16
45  ret void
46}
47
48; Merge two consecutive 16-byte subvector loads into a single 32-byte load
49; if it's faster.
50
51define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
  ; Loads two adjacent <4 x float> values (indices 3 and 4 relative to %ptr),
  ; each with align 1, and concatenates them into one <8 x float> using a
  ; plain shufflevector.
52  ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
53
  ; SANDYB runs expect exactly vmovups, vinsertf128, retq on consecutive lines.
54  ; SANDYB: vmovups
55  ; SANDYB-NEXT: vinsertf128
56  ; SANDYB-NEXT: retq
57
  ; The btver2 run expects a vmovups immediately followed by retq.
58  ; BTVER2: vmovups
59  ; BTVER2-NEXT: retq
60
  ; The core-avx2 run expects a vmovups immediately followed by retq.
61  ; HASWELL: vmovups
62  ; HASWELL-NEXT: retq
63
  ; %ptr1 and %ptr2 address consecutive <4 x float> slots.
64  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
65  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
66  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
67  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  ; Mask 0..7 selects all of %v1 followed by all of %v2.
68  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
69  ret <8 x float> %v3
70}
71
72; Swap the order of the shufflevector operands to ensure that the
73; pattern still matches.
74define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
  ; Same shape as the non-swapped variant, but the shufflevector operands are
  ; given in the opposite order (%v2 first) and the mask is adjusted so the
  ; result still holds %v1's elements followed by %v2's.
75  ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap
76
  ; SANDYB runs expect exactly vmovups, vinsertf128, retq on consecutive lines.
77  ; SANDYB: vmovups
78  ; SANDYB-NEXT: vinsertf128
79  ; SANDYB-NEXT: retq
80
  ; The btver2 run expects a vmovups immediately followed by retq.
81  ; BTVER2: vmovups
82  ; BTVER2-NEXT: retq
83
  ; The core-avx2 run expects a vmovups immediately followed by retq.
84  ; HASWELL: vmovups
85  ; HASWELL-NEXT: retq
86
  ; %ptr1 and %ptr2 address consecutive <4 x float> slots (indices 4 and 5).
87  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
88  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
89  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
90  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  ; Mask indices 4..7 pick the second operand (%v1); 0..3 pick the first (%v2).
91  %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
92  ret <8 x float> %v3
93}
94
95; Check each element type other than float to make sure it is handled correctly.
96; Use the loaded values with an 'add' to make sure we're using the correct load type.
97; Even though BtVer2 has fast 32-byte loads, we should not generate those for
98; 256-bit integer vectors because BtVer2 doesn't have AVX2.
99
100define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
  ; Loads two adjacent <2 x i64> values (indices 5 and 6 relative to %ptr,
  ; align 1), concatenates them into a <4 x i64>, and adds %x to the result.
101  ; CHECK-LABEL: combine_16_byte_loads_i64
102
  ; SANDYB runs expect vextractf128, two vpaddq, vinsertf128, retq in a row.
103  ; SANDYB: vextractf128
104  ; SANDYB-NEXT: vpaddq
105  ; SANDYB-NEXT: vpaddq
106  ; SANDYB-NEXT: vinsertf128
107  ; SANDYB-NEXT: retq
108
  ; The btver2 run expects the same five-instruction sequence.
109  ; BTVER2: vextractf128
110  ; BTVER2-NEXT: vpaddq
111  ; BTVER2-NEXT: vpaddq
112  ; BTVER2-NEXT: vinsertf128
113  ; BTVER2-NEXT: retq
114
  ; The core-avx2 run expects no vextract before a vpaddq that is immediately
  ; followed by retq.
115  ; HASWELL-NOT: vextract
116  ; HASWELL: vpaddq
117  ; HASWELL-NEXT: retq
118
119  %ptr1 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 5
120  %ptr2 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 6
121  %v1 = load <2 x i64>, <2 x i64>* %ptr1, align 1
122  %v2 = load <2 x i64>, <2 x i64>* %ptr2, align 1
  ; Mask 0..3 selects all of %v1 followed by all of %v2.
123  %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Integer add consumes the combined vector.
124  %v4 = add <4 x i64> %v3, %x
125  ret <4 x i64> %v4
126}
127
128define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
  ; Loads two adjacent <4 x i32> values (indices 6 and 7 relative to %ptr,
  ; align 1), concatenates them into an <8 x i32>, and adds %x to the result.
129  ; CHECK-LABEL: combine_16_byte_loads_i32
130
  ; SANDYB runs expect vextractf128, two vpaddd, vinsertf128, retq in a row.
131  ; SANDYB: vextractf128
132  ; SANDYB-NEXT: vpaddd
133  ; SANDYB-NEXT: vpaddd
134  ; SANDYB-NEXT: vinsertf128
135  ; SANDYB-NEXT: retq
136
  ; The btver2 run expects the same five-instruction sequence.
137  ; BTVER2: vextractf128
138  ; BTVER2-NEXT: vpaddd
139  ; BTVER2-NEXT: vpaddd
140  ; BTVER2-NEXT: vinsertf128
141  ; BTVER2-NEXT: retq
142
  ; The core-avx2 run expects no vextract before a vpaddd that is immediately
  ; followed by retq.
143  ; HASWELL-NOT: vextract
144  ; HASWELL: vpaddd
145  ; HASWELL-NEXT: retq
146
147  %ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 6
148  %ptr2 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 7
149  %v1 = load <4 x i32>, <4 x i32>* %ptr1, align 1
150  %v2 = load <4 x i32>, <4 x i32>* %ptr2, align 1
  ; Mask 0..7 selects all of %v1 followed by all of %v2.
151  %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Integer add consumes the combined vector.
152  %v4 = add <8 x i32> %v3, %x
153  ret <8 x i32> %v4
154}
155
156define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
  ; Loads two adjacent <8 x i16> values (indices 7 and 8 relative to %ptr,
  ; align 1), concatenates them into a <16 x i16>, and adds %x to the result.
157  ; CHECK-LABEL: combine_16_byte_loads_i16
158
  ; SANDYB runs expect vextractf128, two vpaddw, vinsertf128, retq in a row.
159  ; SANDYB: vextractf128
160  ; SANDYB-NEXT: vpaddw
161  ; SANDYB-NEXT: vpaddw
162  ; SANDYB-NEXT: vinsertf128
163  ; SANDYB-NEXT: retq
164
  ; The btver2 run expects the same five-instruction sequence.
165  ; BTVER2: vextractf128
166  ; BTVER2-NEXT: vpaddw
167  ; BTVER2-NEXT: vpaddw
168  ; BTVER2-NEXT: vinsertf128
169  ; BTVER2-NEXT: retq
170
  ; The core-avx2 run expects no vextract before a vpaddw that is immediately
  ; followed by retq.
171  ; HASWELL-NOT: vextract
172  ; HASWELL: vpaddw
173  ; HASWELL-NEXT: retq
174
175  %ptr1 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 7
176  %ptr2 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 8
177  %v1 = load <8 x i16>, <8 x i16>* %ptr1, align 1
178  %v2 = load <8 x i16>, <8 x i16>* %ptr2, align 1
  ; Mask 0..15 selects all of %v1 followed by all of %v2.
179  %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; Integer add consumes the combined vector.
180  %v4 = add <16 x i16> %v3, %x
181  ret <16 x i16> %v4
182}
183
184define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
  ; Loads two adjacent <16 x i8> values (indices 8 and 9 relative to %ptr,
  ; align 1), concatenates them into a <32 x i8>, and adds %x to the result.
185  ; CHECK-LABEL: combine_16_byte_loads_i8
186
  ; SANDYB runs expect vextractf128, two vpaddb, vinsertf128, retq in a row.
187  ; SANDYB: vextractf128
188  ; SANDYB-NEXT: vpaddb
189  ; SANDYB-NEXT: vpaddb
190  ; SANDYB-NEXT: vinsertf128
191  ; SANDYB-NEXT: retq
192
  ; The btver2 run expects the same five-instruction sequence.
193  ; BTVER2: vextractf128
194  ; BTVER2-NEXT: vpaddb
195  ; BTVER2-NEXT: vpaddb
196  ; BTVER2-NEXT: vinsertf128
197  ; BTVER2-NEXT: retq
198
  ; The core-avx2 run expects no vextract before a vpaddb that is immediately
  ; followed by retq.
199  ; HASWELL-NOT: vextract
200  ; HASWELL: vpaddb
201  ; HASWELL-NEXT: retq
202
203  %ptr1 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 8
204  %ptr2 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 9
205  %v1 = load <16 x i8>, <16 x i8>* %ptr1, align 1
206  %v2 = load <16 x i8>, <16 x i8>* %ptr2, align 1
  ; Mask 0..31 selects all of %v1 followed by all of %v2.
207  %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ; Integer add consumes the combined vector.
208  %v4 = add <32 x i8> %v3, %x
209  ret <32 x i8> %v4
210}
211
212define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
  ; Loads two adjacent <2 x double> values (indices 9 and 10 relative to %ptr,
  ; align 1), concatenates them into a <4 x double>, and fadds %x to the
  ; result.
213  ; CHECK-LABEL: combine_16_byte_loads_double
214
  ; SANDYB runs expect vmovupd, vinsertf128, vaddpd, retq on consecutive lines.
215  ; SANDYB: vmovupd
216  ; SANDYB-NEXT: vinsertf128
217  ; SANDYB-NEXT: vaddpd
218  ; SANDYB-NEXT: retq
219
  ; The btver2 run expects no vinsertf128 before a vaddpd that is immediately
  ; followed by retq.
220  ; BTVER2-NOT: vinsertf128
221  ; BTVER2: vaddpd
222  ; BTVER2-NEXT: retq
223
  ; The core-avx2 run expects the same: no vinsertf128, then vaddpd and retq.
224  ; HASWELL-NOT: vinsertf128
225  ; HASWELL: vaddpd
226  ; HASWELL-NEXT: retq
227
228  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 9
229  %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 10
230  %v1 = load <2 x double>, <2 x double>* %ptr1, align 1
231  %v2 = load <2 x double>, <2 x double>* %ptr2, align 1
  ; Mask 0..3 selects all of %v1 followed by all of %v2.
232  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Floating-point add consumes the combined vector.
233  %v4 = fadd <4 x double> %v3, %x
234  ret <4 x double> %v4
235}
236
237