1; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=+avx < %s | FileCheck %s
2; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=+avx -addr-sink-using-gep=1 < %s | FileCheck %s
3
4target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
5target triple = "x86_64-apple-macosx10.8.0"
6
7%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
8%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
9
; Each loop iteration stores the constants 1..8 into the eight consecutive i8
; fields of one %struct.A, then advances %p by one struct. The eight byte
; stores must be merged into one 64-bit store of 0x0807060504030201.
;
; CHECK-LABEL (with the trailing colon) anchors the checks to this function's
; label only; a plain CHECK would also match merge_const_store_no_vec/_vec.
; CHECK-LABEL: merge_const_store:
; CHECK: movabsq $578437695752307201
; CHECK: ret
define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
  ; Byte stores to fields 0..7, in ascending address order.
  %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
  store i8 1, i8* %2, align 1
  %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
  store i8 2, i8* %3, align 1
  %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
  store i8 3, i8* %4, align 1
  %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
  store i8 4, i8* %5, align 1
  %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
  store i8 5, i8* %6, align 1
  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
  store i8 6, i8* %7, align 1
  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
  store i8 7, i8* %8, align 1
  %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
  store i8 8, i8* %9, align 1
  ; Next iteration: bump the counter and step to the following struct.
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}
43
; Eight consecutive i32 zero stores into one %struct.B per iteration. The
; function is marked noimplicitfloat, so no vector store may be used for the
; merged result.
;
; CHECK-LABEL (with the trailing colon) pins the CHECK-NOT range to this
; function's body instead of relying on an unanchored substring match.
; CHECK-LABEL: merge_const_store_no_vec:
; CHECK-NOT: vmovups
; CHECK: ret
define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat {
  ; Skip the loop entirely when %count <= 0.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
  ; Zero fields 0..7, in ascending address order.
  %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  store i32 0, i32* %2, align 4
  %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  store i32 0, i32* %3, align 4
  %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
  store i32 0, i32* %4, align 4
  %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
  store i32 0, i32* %5, align 4
  %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
  store i32 0, i32* %6, align 4
  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
  store i32 0, i32* %7, align 4
  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
  store i32 0, i32* %8, align 4
  %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
  store i32 0, i32* %9, align 4
  ; Next iteration: bump the counter and step to the following struct.
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}
77
; Same IR body as merge_const_store_no_vec, but without noimplicitfloat: the
; eight consecutive i32 zero stores should be emitted as a single unaligned
; vector store.
;
; CHECK-LABEL (with the trailing colon) anchors the checks to this function's
; label rather than to any line that happens to contain its name.
; CHECK-LABEL: merge_const_store_vec:
; CHECK: vmovups
; CHECK: ret
define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
  ; Zero fields 0..7, in ascending address order.
  %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  store i32 0, i32* %2, align 4
  %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  store i32 0, i32* %3, align 4
  %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
  store i32 0, i32* %4, align 4
  %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
  store i32 0, i32* %5, align 4
  %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
  store i32 0, i32* %6, align 4
  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
  store i32 0, i32* %7, align 4
  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
  store i32 0, i32* %8, align 4
  %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
  store i32 0, i32* %9, align 4
  ; Next iteration: bump the counter and step to the following struct.
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}
111
; The first four constant bytes (1,2,3,4) are merged into a single 32-bit
; immediate store of 0x04030201 (67305985). The store to field 4 writes the
; non-constant %zz, which breaks the run of constants; the checks expect the
; remaining four bytes to be written with individual byte stores.
;
; CHECK-LABEL (with the trailing colon) anchors the checks to this function's
; label rather than to any line that happens to contain its name.
; CHECK-LABEL: merge_nonconst_store:
; CHECK: movl $67305985
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: ret
define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
  ; Fields 0..3: constants, expected to be merged.
  %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
  store i8 1, i8* %2, align 1
  %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
  store i8 2, i8* %3, align 1
  %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
  store i8 3, i8* %4, align 1
  %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
  store i8 4, i8* %5, align 1
  ; Field 4: the only non-constant value in the sequence.
  %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
  store i8 %zz, i8* %6, align 1                     ;  <----------- Not a const;
  ; Fields 5..7: constants again.
  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
  store i8 6, i8* %7, align 1
  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
  store i8 7, i8* %8, align 1
  %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
  store i8 8, i8* %9, align 1
  ; Next iteration: bump the counter and step to the following struct.
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}
149
150
; Two adjacent byte loads from %q feed two adjacent byte stores into %p. Both
; loads come before both stores, so each pair is expected to become one
; 16-bit move.
; CHECK-LABEL: merge_loads_i16:
;  load:
; CHECK: movw
;  store:
; CHECK: movw
; CHECK: ret
define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
entry:
  ; Nothing to do unless %count is positive.
  %has.work = icmp sgt i32 %count, 0
  br i1 %has.work, label %preheader, label %exit

preheader:
  ; The source addresses (fields 0 and 1 of %q) are computed once, outside the loop.
  %src.b0 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
  %src.b1 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
  br label %loop

loop:
  %iter = phi i32 [ 0, %preheader ], [ %iter.next, %loop ]
  %dst = phi %struct.A* [ %p, %preheader ], [ %dst.next, %loop ]
  ; Both loads first ...
  %lo = load i8, i8* %src.b0, align 1
  %hi = load i8, i8* %src.b1, align 1
  ; ... then both stores, to fields 0 and 1 of the current destination.
  %dst.b0 = getelementptr inbounds %struct.A, %struct.A* %dst, i64 0, i32 0
  store i8 %lo, i8* %dst.b0, align 1
  %dst.b1 = getelementptr inbounds %struct.A, %struct.A* %dst, i64 0, i32 1
  store i8 %hi, i8* %dst.b1, align 1
  ; Advance the counter and step the destination by one struct.
  %iter.next = add nsw i32 %iter, 1
  %dst.next = getelementptr inbounds %struct.A, %struct.A* %dst, i64 1
  %done = icmp eq i32 %iter.next, %count
  br i1 %done, label %exit, label %loop

exit:
  ret void
}
183
; Here the second load sits between the two stores (load, store, load, store),
; so neither the loads nor the stores can be merged: four byte moves expected.
; CHECK-LABEL: no_merge_loads:
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: ret
define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
entry:
  ; Nothing to do unless %count is positive.
  %has.work = icmp sgt i32 %count, 0
  br i1 %has.work, label %preheader, label %exit

preheader:
  ; The source addresses (fields 0 and 1 of %q) are computed once, outside the loop.
  %src.b0 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
  %src.b1 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
  br label %loop

loop:
  %iter = phi i32 [ 0, %preheader ], [ %iter.next, %loop ]
  %dst = phi %struct.A* [ %p, %preheader ], [ %dst.next, %loop ]
  ; First byte: load, then store.
  %lo = load i8, i8* %src.b0, align 1
  %dst.b0 = getelementptr inbounds %struct.A, %struct.A* %dst, i64 0, i32 0
  store i8 %lo, i8* %dst.b0, align 1
  ; Second byte: this load comes after the first store, which is the
  ; interleaving the test is about.
  %dst.b1 = getelementptr inbounds %struct.A, %struct.A* %dst, i64 0, i32 1
  %hi = load i8, i8* %src.b1, align 1
  store i8 %hi, i8* %dst.b1, align 1
  ; Advance the counter and step the destination by one struct.
  %iter.next = add nsw i32 %iter, 1
  %dst.next = getelementptr inbounds %struct.A, %struct.A* %dst, i64 1
  %done = icmp eq i32 %iter.next, %count
  br i1 %done, label %exit, label %loop

exit:
  ret void
}
217
218
; Two adjacent i32 loads from %q feed two adjacent i32 stores into %p; each
; pair is expected to become a single 64-bit move.
; CHECK-LABEL: merge_loads_integer:
;  load:
; CHECK: movq
;  store:
; CHECK: movq
; CHECK: ret
define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
entry:
  ; Nothing to do unless %count is positive.
  %has.work = icmp sgt i32 %count, 0
  br i1 %has.work, label %preheader, label %exit

preheader:
  ; The source addresses (fields 0 and 1 of %q) are computed once, outside the loop.
  %src.w0 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %src.w1 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  br label %loop

loop:
  %iter = phi i32 [ 0, %preheader ], [ %iter.next, %loop ]
  %dst = phi %struct.B* [ %p, %preheader ], [ %dst.next, %loop ]
  ; Both loads first (no explicit alignment is given on these accesses) ...
  %lo = load i32, i32* %src.w0
  %hi = load i32, i32* %src.w1
  ; ... then both stores, to fields 0 and 1 of the current destination.
  %dst.w0 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 0
  store i32 %lo, i32* %dst.w0
  %dst.w1 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 1
  store i32 %hi, i32* %dst.w1
  ; Advance the counter and step the destination by one struct.
  %iter.next = add nsw i32 %iter, 1
  %dst.next = getelementptr inbounds %struct.B, %struct.B* %dst, i64 1
  %done = icmp eq i32 %iter.next, %count
  br i1 %done, label %exit, label %loop

exit:
  ret void
}
251
252
; Four adjacent i32 loads from %q feed four adjacent i32 stores into %p; each
; group of four is expected to become one unaligned vector move.
; CHECK-LABEL: merge_loads_vector:
;  load:
; CHECK: movups
;  store:
; CHECK: movups
; CHECK: ret
define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
entry:
  ; Nothing to do unless %count is positive.
  %has.work = icmp sgt i32 %count, 0
  br i1 %has.work, label %preheader, label %exit

preheader:
  ; The source addresses (fields 0..3 of %q) are computed once, outside the loop.
  %src0 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %src1 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  %src2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
  %src3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
  br label %loop

loop:
  %iter = phi i32 [ 0, %preheader ], [ %iter.next, %loop ]
  %dst = phi %struct.B* [ %p, %preheader ], [ %dst.next, %loop ]
  ; Destination addresses: fields 0..3 of the current struct.
  %dst0 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 0
  %dst1 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 1
  %dst2 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 2
  %dst3 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 3
  ; All four loads, then all four stores (no explicit alignment given).
  %v0 = load i32, i32* %src0
  %v1 = load i32, i32* %src1
  %v2 = load i32, i32* %src2
  %v3 = load i32, i32* %src3
  store i32 %v0, i32* %dst0
  store i32 %v1, i32* %dst1
  store i32 %v2, i32* %dst2
  store i32 %v3, i32* %dst3
  ; Advance the counter and step the destination by one struct.
  %iter.next = add nsw i32 %iter, 1
  %dst.next = getelementptr inbounds %struct.B, %struct.B* %dst, i64 1
  %done = icmp eq i32 %iter.next, %count
  br i1 %done, label %exit, label %loop

exit:
  ret void
}
293
; Same access pattern as merge_loads_vector, but every load and store is
; marked "align 1". The checks expect no merging here: four separate 32-bit
; loads followed by four separate 32-bit stores.
; CHECK-LABEL: merge_loads_no_align:
;  load:
; CHECK: movl
; CHECK: movl
; CHECK: movl
; CHECK: movl
;  store:
; CHECK: movl
; CHECK: movl
; CHECK: movl
; CHECK: movl
; CHECK: ret
define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
entry:
  ; Nothing to do unless %count is positive.
  %has.work = icmp sgt i32 %count, 0
  br i1 %has.work, label %preheader, label %exit

preheader:
  ; The source addresses (fields 0..3 of %q) are computed once, outside the loop.
  %src0 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %src1 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  %src2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
  %src3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
  br label %loop

loop:
  %iter = phi i32 [ 0, %preheader ], [ %iter.next, %loop ]
  %dst = phi %struct.B* [ %p, %preheader ], [ %dst.next, %loop ]
  ; Destination addresses: fields 0..3 of the current struct.
  %dst0 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 0
  %dst1 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 1
  %dst2 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 2
  %dst3 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 3
  ; All four loads, then all four stores, each with byte alignment only.
  %v0 = load i32, i32* %src0, align 1
  %v1 = load i32, i32* %src1, align 1
  %v2 = load i32, i32* %src2, align 1
  %v3 = load i32, i32* %src3, align 1
  store i32 %v0, i32* %dst0, align 1
  store i32 %v1, i32* %dst1, align 1
  store i32 %v2, i32* %dst2, align 1
  store i32 %v3, i32* %dst3, align 1
  ; Advance the counter and step the destination by one struct.
  %iter.next = add nsw i32 %iter, 1
  %dst.next = getelementptr inbounds %struct.B, %struct.B* %dst, i64 1
  %done = icmp eq i32 %iter.next, %count
  br i1 %done, label %exit, label %loop

exit:
  ret void
}
340
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy.
;
; Each iteration reads an i64 index from %a, loads the bytes at %c[index] and
; %c[index + 1], and stores them to two consecutive bytes of %b.
;
; CHECK-LABEL with the trailing colon is used because this function's name is
; a prefix of MergeLoadStoreBaseIndexOffsetSext; a plain CHECK on the name is
; an unanchored substring match.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffset:
; CHECK: movw    (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
; CHECK: movw    [[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
  br label %1

; <label>:1
  %.09 = phi i32 [ %n, %0 ], [ %11, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %10, %1 ]
  %.0 = phi i64* [ %a, %0 ], [ %2, %1 ]
  ; Fetch the next index and advance the index pointer.
  %2 = getelementptr inbounds i64, i64* %.0, i64 1
  %3 = load i64, i64* %.0, align 1
  ; Load c[index] and c[index + 1]: same base, consecutive offsets.
  %4 = getelementptr inbounds i8, i8* %c, i64 %3
  %5 = load i8, i8* %4, align 1
  %6 = add i64 %3, 1
  %7 = getelementptr inbounds i8, i8* %c, i64 %6
  %8 = load i8, i8* %7, align 1
  ; Store both bytes to consecutive destination addresses.
  store i8 %5, i8* %.08, align 1
  %9 = getelementptr inbounds i8, i8* %.08, i64 1
  store i8 %8, i8* %9, align 1
  ; Advance the destination by two bytes and count %n down to zero.
  %10 = getelementptr inbounds i8, i8* %.08, i64 2
  %11 = add nsw i32 %.09, -1
  %12 = icmp eq i32 %11, 0
  br i1 %12, label %13, label %1

; <label>:13
  ret void
}
371
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.
;
; Here the index is an i8 that is sign-extended to i64 first; the "+ 1" for
; the second address is then done in i64, so both addresses share the same
; sign-extended index.
;
; CHECK-LABEL with the trailing colon anchors the checks to this function's
; label instead of relying on an unanchored substring match on the name.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetSext:
; CHECK: movw    (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
; CHECK: movw    [[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
  br label %1

; <label>:1
  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
  ; Fetch the next i8 index, advance the index pointer, and widen the index.
  %2 = getelementptr inbounds i8, i8* %.0, i64 1
  %3 = load i8, i8* %.0, align 1
  %4 = sext i8 %3 to i64
  ; Load c[index] and c[index + 1]; the add happens after the sign extension.
  %5 = getelementptr inbounds i8, i8* %c, i64 %4
  %6 = load i8, i8* %5, align 1
  %7 = add i64 %4, 1
  %8 = getelementptr inbounds i8, i8* %c, i64 %7
  %9 = load i8, i8* %8, align 1
  ; Store both bytes to consecutive destination addresses.
  store i8 %6, i8* %.08, align 1
  %10 = getelementptr inbounds i8, i8* %.08, i64 1
  store i8 %9, i8* %10, align 1
  ; Advance the destination by two bytes and count %n down to zero.
  %11 = getelementptr inbounds i8, i8* %.08, i64 2
  %12 = add nsw i32 %.09, -1
  %13 = icmp eq i32 %12, 0
  br i1 %13, label %14, label %1

; <label>:14
  ret void
}
404
; However, sign extensions can only be ignored when merging if they are applied
; to all of the address computations in the same way.
;
; Here the "+ 1" is done in i8 *before* the sign extension, so it can wrap
; (127 + 1 becomes -128) and the two loads are not guaranteed to be adjacent.
; No 16-bit merged load or store may be emitted.
;
; The negative checks deliberately use no FileCheck variables: a variable
; defined inside a CHECK-NOT is only bound if that forbidden pattern matches,
; so a following CHECK-NOT that uses it can never fire.
; CHECK-LABEL: loadStoreBaseIndexOffsetSextNoSex:
; CHECK-NOT: movw    (%{{.*}},%{{.*}}), %{{[a-z]+}}
; CHECK-NOT: movw    %{{[a-z]+}}, (%{{.*}})
define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
  br label %1

; <label>:1
  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
  ; Fetch the next i8 index and advance the index pointer.
  %2 = getelementptr inbounds i8, i8* %.0, i64 1
  %3 = load i8, i8* %.0, align 1
  ; First address: c[sext(index)].
  %4 = sext i8 %3 to i64
  %5 = getelementptr inbounds i8, i8* %c, i64 %4
  %6 = load i8, i8* %5, align 1
  ; Second address: c[sext(index + 1)], with the add performed in i8.
  %7 = add i8 %3, 1
  %wrap.4 = sext i8 %7 to i64
  %8 = getelementptr inbounds i8, i8* %c, i64 %wrap.4
  %9 = load i8, i8* %8, align 1
  ; Store both bytes to consecutive destination addresses.
  store i8 %6, i8* %.08, align 1
  %10 = getelementptr inbounds i8, i8* %.08, i64 1
  store i8 %9, i8* %10, align 1
  ; Advance the destination by two bytes and count %n down to zero.
  %11 = getelementptr inbounds i8, i8* %.08, i64 2
  %12 = add nsw i32 %.09, -1
  %13 = icmp eq i32 %12, 0
  br i1 %13, label %14, label %1

; <label>:14
  ret void
}
437
; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
; All eight lanes of %v are extracted and stored to eight consecutive floats
; starting at %ptr; the checks expect a single vector store, directly followed
; by vzeroupper and the return.
; CHECK-LABEL: merge_vec_element_store
; CHECK: vmovups
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
  ; Pull out every lane of the input vector.
  %lane0 = extractelement <8 x float> %v, i32 0
  %lane1 = extractelement <8 x float> %v, i32 1
  %lane2 = extractelement <8 x float> %v, i32 2
  %lane3 = extractelement <8 x float> %v, i32 3
  %lane4 = extractelement <8 x float> %v, i32 4
  %lane5 = extractelement <8 x float> %v, i32 5
  %lane6 = extractelement <8 x float> %v, i32 6
  %lane7 = extractelement <8 x float> %v, i32 7
  ; Addresses of elements 1..7; element 0 is %ptr itself.
  %slot1 = getelementptr inbounds float, float* %ptr, i64 1
  %slot2 = getelementptr inbounds float, float* %ptr, i64 2
  %slot3 = getelementptr inbounds float, float* %ptr, i64 3
  %slot4 = getelementptr inbounds float, float* %ptr, i64 4
  %slot5 = getelementptr inbounds float, float* %ptr, i64 5
  %slot6 = getelementptr inbounds float, float* %ptr, i64 6
  %slot7 = getelementptr inbounds float, float* %ptr, i64 7
  ; Lane i goes to element i.
  store float %lane0, float* %ptr, align 4
  store float %lane1, float* %slot1, align 4
  store float %lane2, float* %slot2, align 4
  store float %lane3, float* %slot3, align 4
  store float %lane4, float* %slot4, align 4
  store float %lane5, float* %slot5, align 4
  store float %lane6, float* %slot6, align 4
  store float %lane7, float* %slot7, align 4
  ret void
}
470
; Minimized from real code that was failing.
; A scalar i64 load and lane 0 of a <2 x i64> load read elements 0 and 1 of
; the array and are stored to elements 4 and 5. These could in principle be
; merged (the stores as well as the loads), but the checks below expect four
; separate 64-bit moves.
; CHECK-LABEL: merge_vec_element_and_scalar_load
; CHECK:      movq	(%rdi), %rax
; CHECK-NEXT: movq	%rax, 32(%rdi)
; CHECK-NEXT: movq	8(%rdi), %rax
; CHECK-NEXT: movq	%rax, 40(%rdi)
; CHECK-NEXT: retq
define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
  ; Addresses of array elements 0, 1, 4 and 5.
  %slot0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
  %slot1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
  %slot4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4
  %slot5 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 5

  ; array[4] = array[0], via a plain scalar load.
  %first = load i64, i64* %slot0, align 8
  store i64 %first, i64* %slot4, align 8

  ; array[5] = array[1], read as lane 0 of a two-element vector load that
  ; starts at element 1.
  %vec.ptr = bitcast i64* %slot1 to <2 x i64>*
  %pair = load <2 x i64>, <2 x i64>* %vec.ptr, align 8
  %lane0 = extractelement <2 x i64> %pair, i32 0
  store i64 %lane0, i64* %slot5, align 8
  ret void
}
496