1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
3; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
4
5%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
6%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
7%struct.C = type { i8, i8, i8, i8, i32, i32, i32, i64 }
8
9; save 1,2,3 ... as one big integer.
10define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
11; CHECK-LABEL: merge_const_store:
12; CHECK:       # %bb.0:
13; CHECK-NEXT:    testl %edi, %edi
14; CHECK-NEXT:    jle .LBB0_3
15; CHECK-NEXT:  # %bb.1: # %.lr.ph.preheader
16; CHECK-NEXT:    movabsq $578437695752307201, %rax # imm = 0x807060504030201
17; CHECK-NEXT:    .p2align 4, 0x90
18; CHECK-NEXT:  .LBB0_2: # %.lr.ph
19; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
20; CHECK-NEXT:    movq %rax, (%rsi)
21; CHECK-NEXT:    addq $8, %rsi
22; CHECK-NEXT:    decl %edi
23; CHECK-NEXT:    jne .LBB0_2
24; CHECK-NEXT:  .LBB0_3: # %._crit_edge
25; CHECK-NEXT:    retq
; Skip the loop entirely unless %count > 0.
26  %1 = icmp sgt i32 %count, 0
27  br i1 %1, label %.lr.ph, label %._crit_edge
28.lr.ph:
29  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
30  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
; Write the byte constants 1..8 to the eight consecutive i8 fields of the
; current %struct.A. The assertions above expect these to be emitted as one
; 64-bit store of 0x0807060504030201 per iteration.
31  %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
32  store i8 1, i8* %2, align 1
33  %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
34  store i8 2, i8* %3, align 1
35  %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
36  store i8 3, i8* %4, align 1
37  %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
38  store i8 4, i8* %5, align 1
39  %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
40  store i8 5, i8* %6, align 1
41  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
42  store i8 6, i8* %7, align 1
43  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
44  store i8 7, i8* %8, align 1
45  %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
46  store i8 8, i8* %9, align 1
; Step to the next %struct.A and repeat until %count iterations have run.
47  %10 = add nsw i32 %i.02, 1
48  %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
49  %exitcond = icmp eq i32 %10, %count
50  br i1 %exitcond, label %._crit_edge, label %.lr.ph
51._crit_edge:
52  ret void
53}
54
55; No vectors because we use noimplicitfloat
56define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat{
57; CHECK-LABEL: merge_const_store_no_vec:
58; CHECK:       # %bb.0:
59; CHECK-NEXT:    testl %edi, %edi
60; CHECK-NEXT:    jle .LBB1_2
61; CHECK-NEXT:    .p2align 4, 0x90
62; CHECK-NEXT:  .LBB1_1: # %.lr.ph
63; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
64; CHECK-NEXT:    movq $0, (%rsi)
65; CHECK-NEXT:    movq $0, 8(%rsi)
66; CHECK-NEXT:    movq $0, 16(%rsi)
67; CHECK-NEXT:    movq $0, 24(%rsi)
68; CHECK-NEXT:    addq $32, %rsi
69; CHECK-NEXT:    decl %edi
70; CHECK-NEXT:    jne .LBB1_1
71; CHECK-NEXT:  .LBB1_2: # %._crit_edge
72; CHECK-NEXT:    retq
; Skip the loop entirely unless %count > 0.
73  %1 = icmp sgt i32 %count, 0
74  br i1 %1, label %.lr.ph, label %._crit_edge
75.lr.ph:
76  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
77  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
; Zero all eight consecutive i32 fields of the current %struct.B. The function
; is marked noimplicitfloat, and the assertions above expect four 64-bit
; integer stores rather than a vector store.
78  %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
79  store i32 0, i32* %2, align 4
80  %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
81  store i32 0, i32* %3, align 4
82  %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
83  store i32 0, i32* %4, align 4
84  %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
85  store i32 0, i32* %5, align 4
86  %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
87  store i32 0, i32* %6, align 4
88  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
89  store i32 0, i32* %7, align 4
90  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
91  store i32 0, i32* %8, align 4
92  %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
93  store i32 0, i32* %9, align 4
; Step to the next %struct.B and repeat until %count iterations have run.
94  %10 = add nsw i32 %i.02, 1
95  %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
96  %exitcond = icmp eq i32 %10, %count
97  br i1 %exitcond, label %._crit_edge, label %.lr.ph
98._crit_edge:
99  ret void
100}
101
102; Move the constants using a single vector store.
103define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
104; CHECK-LABEL: merge_const_store_vec:
105; CHECK:       # %bb.0:
106; CHECK-NEXT:    testl %edi, %edi
107; CHECK-NEXT:    jle .LBB2_3
108; CHECK-NEXT:  # %bb.1: # %.lr.ph.preheader
109; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
110; CHECK-NEXT:    .p2align 4, 0x90
111; CHECK-NEXT:  .LBB2_2: # %.lr.ph
112; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
113; CHECK-NEXT:    vmovups %ymm0, (%rsi)
114; CHECK-NEXT:    addq $32, %rsi
115; CHECK-NEXT:    decl %edi
116; CHECK-NEXT:    jne .LBB2_2
117; CHECK-NEXT:  .LBB2_3: # %._crit_edge
118; CHECK-NEXT:    vzeroupper
119; CHECK-NEXT:    retq
; Skip the loop entirely unless %count > 0.
120  %1 = icmp sgt i32 %count, 0
121  br i1 %1, label %.lr.ph, label %._crit_edge
122.lr.ph:
123  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
124  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
; Zero all eight consecutive i32 fields of the current %struct.B. This is the
; same store sequence as merge_const_store_no_vec but without noimplicitfloat;
; the assertions above expect a single 32-byte vector store per iteration.
125  %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
126  store i32 0, i32* %2, align 4
127  %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
128  store i32 0, i32* %3, align 4
129  %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
130  store i32 0, i32* %4, align 4
131  %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
132  store i32 0, i32* %5, align 4
133  %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
134  store i32 0, i32* %6, align 4
135  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
136  store i32 0, i32* %7, align 4
137  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
138  store i32 0, i32* %8, align 4
139  %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
140  store i32 0, i32* %9, align 4
; Step to the next %struct.B and repeat until %count iterations have run.
141  %10 = add nsw i32 %i.02, 1
142  %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
143  %exitcond = icmp eq i32 %10, %count
144  br i1 %exitcond, label %._crit_edge, label %.lr.ph
145._crit_edge:
146  ret void
147}
148
149; Move the first 4 constants as a single 32-bit integer store. Move the rest as scalars.
150define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
151; CHECK-LABEL: merge_nonconst_store:
152; CHECK:       # %bb.0:
153; CHECK-NEXT:    testl %edi, %edi
154; CHECK-NEXT:    jle .LBB3_2
155; CHECK-NEXT:    .p2align 4, 0x90
156; CHECK-NEXT:  .LBB3_1: # %.lr.ph
157; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
158; CHECK-NEXT:    movl $67305985, (%rdx) # imm = 0x4030201
159; CHECK-NEXT:    movb %sil, 4(%rdx)
160; CHECK-NEXT:    movw $1798, 5(%rdx) # imm = 0x706
161; CHECK-NEXT:    movb $8, 7(%rdx)
162; CHECK-NEXT:    addq $8, %rdx
163; CHECK-NEXT:    decl %edi
164; CHECK-NEXT:    jne .LBB3_1
165; CHECK-NEXT:  .LBB3_2: # %._crit_edge
166; CHECK-NEXT:    retq
; Skip the loop entirely unless %count > 0.
167  %1 = icmp sgt i32 %count, 0
168  br i1 %1, label %.lr.ph, label %._crit_edge
169.lr.ph:
170  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
171  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
; Fields 0-3 and 5-7 receive byte constants; field 4 receives the argument %zz.
; The assertions above expect a 32-bit store for fields 0-3, a byte store of
; %zz, a 16-bit store for fields 5-6 and a byte store for field 7.
172  %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
173  store i8 1, i8* %2, align 1
174  %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
175  store i8 2, i8* %3, align 1
176  %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
177  store i8 3, i8* %4, align 1
178  %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
179  store i8 4, i8* %5, align 1
180  %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
181  store i8 %zz, i8* %6, align 1                     ;  <----------- Not a const;
182  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
183  store i8 6, i8* %7, align 1
184  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
185  store i8 7, i8* %8, align 1
186  %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
187  store i8 8, i8* %9, align 1
; Step to the next %struct.A and repeat until %count iterations have run.
188  %10 = add nsw i32 %i.02, 1
189  %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
190  %exitcond = icmp eq i32 %10, %count
191  br i1 %exitcond, label %._crit_edge, label %.lr.ph
192._crit_edge:
193  ret void
194}
195
196define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
197; BWON-LABEL: merge_loads_i16:
198; BWON:       # %bb.0:
199; BWON-NEXT:    testl %edi, %edi
200; BWON-NEXT:    jle .LBB4_2
201; BWON-NEXT:    .p2align 4, 0x90
202; BWON-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
203; BWON-NEXT:    movzwl (%rsi), %eax
204; BWON-NEXT:    movw %ax, (%rdx)
205; BWON-NEXT:    addq $8, %rdx
206; BWON-NEXT:    decl %edi
207; BWON-NEXT:    jne .LBB4_1
208; BWON-NEXT:  .LBB4_2: # %._crit_edge
209; BWON-NEXT:    retq
210;
211; BWOFF-LABEL: merge_loads_i16:
212; BWOFF:       # %bb.0:
213; BWOFF-NEXT:    testl %edi, %edi
214; BWOFF-NEXT:    jle .LBB4_2
215; BWOFF-NEXT:    .p2align 4, 0x90
216; BWOFF-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
217; BWOFF-NEXT:    movw (%rsi), %ax
218; BWOFF-NEXT:    movw %ax, (%rdx)
219; BWOFF-NEXT:    addq $8, %rdx
220; BWOFF-NEXT:    decl %edi
221; BWOFF-NEXT:    jne .LBB4_1
222; BWOFF-NEXT:  .LBB4_2: # %._crit_edge
223; BWOFF-NEXT:    retq
; Copies the first two i8 fields of %q into each %struct.A starting at %p.
; Skip the loop entirely unless %count > 0.
224  %1 = icmp sgt i32 %count, 0
225  br i1 %1, label %.lr.ph, label %._crit_edge
226
227.lr.ph:                                           ; preds = %0
; Source addresses are loop-invariant: fields 0 and 1 of %q.
228  %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
229  %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
230  br label %4
231
232; <label>:4                                       ; preds = %4, %.lr.ph
233  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
234  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
; Both byte loads precede both byte stores. The assertions above expect one
; 16-bit load and one 16-bit store per iteration.
235  %5 = load i8, i8* %2, align 1
236  %6 = load i8, i8* %3, align 1
237  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
238  store i8 %5, i8* %7, align 1
239  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
240  store i8 %6, i8* %8, align 1
; Step to the next destination %struct.A and repeat until %count iterations.
241  %9 = add nsw i32 %i.02, 1
242  %10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
243  %exitcond = icmp eq i32 %9, %count
244  br i1 %exitcond, label %._crit_edge, label %4
245
246._crit_edge:                                      ; preds = %4, %0
247  ret void
248}
249
250; The loads and the stores are interleaved. Can't merge them.
251define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
252; BWON-LABEL: no_merge_loads:
253; BWON:       # %bb.0:
254; BWON-NEXT:    testl %edi, %edi
255; BWON-NEXT:    jle .LBB5_2
256; BWON-NEXT:    .p2align 4, 0x90
257; BWON-NEXT:  .LBB5_1: # %a4
258; BWON-NEXT:    # =>This Inner Loop Header: Depth=1
259; BWON-NEXT:    movzbl (%rsi), %eax
260; BWON-NEXT:    movb %al, (%rdx)
261; BWON-NEXT:    movzbl 1(%rsi), %eax
262; BWON-NEXT:    movb %al, 1(%rdx)
263; BWON-NEXT:    addq $8, %rdx
264; BWON-NEXT:    decl %edi
265; BWON-NEXT:    jne .LBB5_1
266; BWON-NEXT:  .LBB5_2: # %._crit_edge
267; BWON-NEXT:    retq
268;
269; BWOFF-LABEL: no_merge_loads:
270; BWOFF:       # %bb.0:
271; BWOFF-NEXT:    testl %edi, %edi
272; BWOFF-NEXT:    jle .LBB5_2
273; BWOFF-NEXT:    .p2align 4, 0x90
274; BWOFF-NEXT:  .LBB5_1: # %a4
275; BWOFF-NEXT:    # =>This Inner Loop Header: Depth=1
276; BWOFF-NEXT:    movb (%rsi), %al
277; BWOFF-NEXT:    movb %al, (%rdx)
278; BWOFF-NEXT:    movb 1(%rsi), %al
279; BWOFF-NEXT:    movb %al, 1(%rdx)
280; BWOFF-NEXT:    addq $8, %rdx
281; BWOFF-NEXT:    decl %edi
282; BWOFF-NEXT:    jne .LBB5_1
283; BWOFF-NEXT:  .LBB5_2: # %._crit_edge
284; BWOFF-NEXT:    retq
; Skip the loop entirely unless %count > 0.
285  %1 = icmp sgt i32 %count, 0
286  br i1 %1, label %.lr.ph, label %._crit_edge
287
288.lr.ph:                                           ; preds = %0
; Source addresses are loop-invariant: fields 0 and 1 of %q.
289  %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
290  %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
291  br label %a4
292
293a4:                                       ; preds = %a4, %.lr.ph
294  %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
295  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
; The first byte is loaded and stored before the second byte is loaded. The
; assertions above expect two separate byte copies per iteration.
296  %a5 = load i8, i8* %2, align 1
297  %a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
298  store i8 %a5, i8* %a7, align 1
299  %a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
300  %a6 = load i8, i8* %3, align 1
301  store i8 %a6, i8* %a8, align 1
; Step to the next destination %struct.A and repeat until %count iterations.
302  %a9 = add nsw i32 %i.02, 1
303  %a10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
304  %exitcond = icmp eq i32 %a9, %count
305  br i1 %exitcond, label %._crit_edge, label %a4
306
307._crit_edge:                                      ; preds = %a4, %0
308  ret void
309}
310
311define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
312; CHECK-LABEL: merge_loads_integer:
313; CHECK:       # %bb.0:
314; CHECK-NEXT:    testl %edi, %edi
315; CHECK-NEXT:    jle .LBB6_2
316; CHECK-NEXT:    .p2align 4, 0x90
317; CHECK-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
318; CHECK-NEXT:    movq (%rsi), %rax
319; CHECK-NEXT:    movq %rax, (%rdx)
320; CHECK-NEXT:    addq $32, %rdx
321; CHECK-NEXT:    decl %edi
322; CHECK-NEXT:    jne .LBB6_1
323; CHECK-NEXT:  .LBB6_2: # %._crit_edge
324; CHECK-NEXT:    retq
; Copies the first two i32 fields of %q into each %struct.B starting at %p.
; Skip the loop entirely unless %count > 0.
325  %1 = icmp sgt i32 %count, 0
326  br i1 %1, label %.lr.ph, label %._crit_edge
327
328.lr.ph:                                           ; preds = %0
; Source addresses are loop-invariant: fields 0 and 1 of %q.
329  %2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
330  %3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
331  br label %4
332
333; <label>:4                                       ; preds = %4, %.lr.ph
334  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
335  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ]
; Two adjacent i32 loads followed by two adjacent i32 stores. The assertions
; above expect one 64-bit load and one 64-bit store per iteration.
336  %5 = load i32, i32* %2
337  %6 = load i32, i32* %3
338  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
339  store i32 %5, i32* %7
340  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
341  store i32 %6, i32* %8
; Step to the next destination %struct.B and repeat until %count iterations.
342  %9 = add nsw i32 %i.02, 1
343  %10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
344  %exitcond = icmp eq i32 %9, %count
345  br i1 %exitcond, label %._crit_edge, label %4
346
347._crit_edge:                                      ; preds = %4, %0
348  ret void
349}
350
351define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
352; CHECK-LABEL: merge_loads_vector:
353; CHECK:       # %bb.0:
354; CHECK-NEXT:    testl %edi, %edi
355; CHECK-NEXT:    jle .LBB7_2
356; CHECK-NEXT:    .p2align 4, 0x90
357; CHECK-NEXT:  .LBB7_1: # %block4
358; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
359; CHECK-NEXT:    vmovups (%rsi), %xmm0
360; CHECK-NEXT:    vmovups %xmm0, (%rdx)
361; CHECK-NEXT:    addq $32, %rdx
362; CHECK-NEXT:    decl %edi
363; CHECK-NEXT:    jne .LBB7_1
364; CHECK-NEXT:  .LBB7_2: # %._crit_edge
365; CHECK-NEXT:    retq
; Copies the first four i32 fields of %q into each %struct.B starting at %p.
; Skip the loop entirely unless %count > 0.
366  %a1 = icmp sgt i32 %count, 0
367  br i1 %a1, label %.lr.ph, label %._crit_edge
368
369.lr.ph:                                           ; preds = %0
; Source addresses are loop-invariant: fields 0 through 3 of %q.
370  %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
371  %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
372  %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
373  %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
374  br label %block4
375
376block4:                                       ; preds = %block4, %.lr.ph
377  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
378  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
379  %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
380  %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
381  %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
382  %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
; Four adjacent i32 loads followed by four adjacent i32 stores. The assertions
; above expect one 16-byte vector load and one 16-byte vector store.
383  %b1 = load i32, i32* %a2
384  %b2 = load i32, i32* %a3
385  %b3 = load i32, i32* %a4
386  %b4 = load i32, i32* %a5
387  store i32 %b1, i32* %a7
388  store i32 %b2, i32* %a8
389  store i32 %b3, i32* %a9
390  store i32 %b4, i32* %a10
; Step to the next destination %struct.B and repeat until %count iterations.
391  %c9 = add nsw i32 %i.02, 1
392  %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
393  %exitcond = icmp eq i32 %c9, %count
394  br i1 %exitcond, label %._crit_edge, label %block4
395
396._crit_edge:                                      ; preds = %block4, %0
397  ret void
398}
399
400; On x86, even unaligned copies can be merged to vector ops.
401define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
402; CHECK-LABEL: merge_loads_no_align:
403; CHECK:       # %bb.0:
404; CHECK-NEXT:    testl %edi, %edi
405; CHECK-NEXT:    jle .LBB8_2
406; CHECK-NEXT:    .p2align 4, 0x90
407; CHECK-NEXT:  .LBB8_1: # %block4
408; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
409; CHECK-NEXT:    vmovups (%rsi), %xmm0
410; CHECK-NEXT:    vmovups %xmm0, (%rdx)
411; CHECK-NEXT:    addq $32, %rdx
412; CHECK-NEXT:    decl %edi
413; CHECK-NEXT:    jne .LBB8_1
414; CHECK-NEXT:  .LBB8_2: # %._crit_edge
415; CHECK-NEXT:    retq
; Skip the loop entirely unless %count > 0.
416  %a1 = icmp sgt i32 %count, 0
417  br i1 %a1, label %.lr.ph, label %._crit_edge
418
419.lr.ph:                                           ; preds = %0
; Source addresses are loop-invariant: fields 0 through 3 of %q.
420  %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
421  %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
422  %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
423  %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
424  br label %block4
425
426block4:                                       ; preds = %block4, %.lr.ph
427  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
428  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
429  %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
430  %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
431  %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
432  %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
; Same copy as merge_loads_vector, but every load and store is only
; 1-byte aligned. The assertions above still expect a single unaligned
; 16-byte vector load and store.
433  %b1 = load i32, i32* %a2, align 1
434  %b2 = load i32, i32* %a3, align 1
435  %b3 = load i32, i32* %a4, align 1
436  %b4 = load i32, i32* %a5, align 1
437  store i32 %b1, i32* %a7, align 1
438  store i32 %b2, i32* %a8, align 1
439  store i32 %b3, i32* %a9, align 1
440  store i32 %b4, i32* %a10, align 1
; Step to the next destination %struct.B and repeat until %count iterations.
441  %c9 = add nsw i32 %i.02, 1
442  %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
443  %exitcond = icmp eq i32 %c9, %count
444  br i1 %exitcond, label %._crit_edge, label %block4
445
446._crit_edge:                                      ; preds = %block4, %0
447  ret void
448}
449
450; Make sure that we merge the consecutive load/store sequence below and use a
451; word (16 bit) instead of a byte copy.
452define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
453; BWON-LABEL: MergeLoadStoreBaseIndexOffset:
454; BWON:       # %bb.0:
455; BWON-NEXT:    movl %ecx, %r8d
456; BWON-NEXT:    xorl %ecx, %ecx
457; BWON-NEXT:    .p2align 4, 0x90
458; BWON-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
459; BWON-NEXT:    movq (%rdi,%rcx,8), %rax
460; BWON-NEXT:    movzwl (%rdx,%rax), %eax
461; BWON-NEXT:    movw %ax, (%rsi,%rcx,2)
462; BWON-NEXT:    incq %rcx
463; BWON-NEXT:    cmpl %ecx, %r8d
464; BWON-NEXT:    jne .LBB9_1
465; BWON-NEXT:  # %bb.2:
466; BWON-NEXT:    retq
467;
468; BWOFF-LABEL: MergeLoadStoreBaseIndexOffset:
469; BWOFF:       # %bb.0:
470; BWOFF-NEXT:    movl %ecx, %r8d
471; BWOFF-NEXT:    xorl %ecx, %ecx
472; BWOFF-NEXT:    .p2align 4, 0x90
473; BWOFF-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
474; BWOFF-NEXT:    movq (%rdi,%rcx,8), %rax
475; BWOFF-NEXT:    movw (%rdx,%rax), %ax
476; BWOFF-NEXT:    movw %ax, (%rsi,%rcx,2)
477; BWOFF-NEXT:    incq %rcx
478; BWOFF-NEXT:    cmpl %ecx, %r8d
479; BWOFF-NEXT:    jne .LBB9_1
480; BWOFF-NEXT:  # %bb.2:
481; BWOFF-NEXT:    retq
482  br label %1
483
484; <label>:1
; Loop state: %.09 = remaining iteration count, %.08 = destination cursor into
; %b, %.0 = cursor into the i64 index array %a.
485  %.09 = phi i32 [ %n, %0 ], [ %11, %1 ]
486  %.08 = phi i8* [ %b, %0 ], [ %10, %1 ]
487  %.0 = phi i64* [ %a, %0 ], [ %2, %1 ]
488  %2 = getelementptr inbounds i64, i64* %.0, i64 1
; Read an i64 index %3, then load the bytes at %c + %3 and %c + %3 + 1.
489  %3 = load i64, i64* %.0, align 1
490  %4 = getelementptr inbounds i8, i8* %c, i64 %3
491  %5 = load i8, i8* %4, align 1
492  %6 = add i64 %3, 1
493  %7 = getelementptr inbounds i8, i8* %c, i64 %6
494  %8 = load i8, i8* %7, align 1
; Store the two bytes to consecutive destination addresses, then advance the
; destination by 2 bytes.
495  store i8 %5, i8* %.08, align 1
496  %9 = getelementptr inbounds i8, i8* %.08, i64 1
497  store i8 %8, i8* %9, align 1
498  %10 = getelementptr inbounds i8, i8* %.08, i64 2
; Count %.09 down and leave the loop when it reaches zero.
499  %11 = add nsw i32 %.09, -1
500  %12 = icmp eq i32 %11, 0
501  br i1 %12, label %13, label %1
502
503; <label>:13
504  ret void
505}
506
507; Make sure that we merge the consecutive load/store sequence below and use a
508; word (16 bit) instead of a byte copy for complicated address calculation.
509define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) {
510; BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
511; BWON:       # %bb.0:
512; BWON-NEXT:    xorl %r8d, %r8d
513; BWON-NEXT:    .p2align 4, 0x90
514; BWON-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
515; BWON-NEXT:    movsbq (%rsi), %rax
516; BWON-NEXT:    movzwl (%rdx,%rax), %eax
517; BWON-NEXT:    movw %ax, (%rdi,%r8)
518; BWON-NEXT:    incq %rsi
519; BWON-NEXT:    addq $2, %r8
520; BWON-NEXT:    cmpq %rcx, %r8
521; BWON-NEXT:    jl .LBB10_1
522; BWON-NEXT:  # %bb.2:
523; BWON-NEXT:    retq
524;
525; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
526; BWOFF:       # %bb.0:
527; BWOFF-NEXT:    xorl %r8d, %r8d
528; BWOFF-NEXT:    .p2align 4, 0x90
529; BWOFF-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
530; BWOFF-NEXT:    movsbq (%rsi), %rax
531; BWOFF-NEXT:    movw (%rdx,%rax), %ax
532; BWOFF-NEXT:    movw %ax, (%rdi,%r8)
533; BWOFF-NEXT:    incq %rsi
534; BWOFF-NEXT:    addq $2, %r8
535; BWOFF-NEXT:    cmpq %rcx, %r8
536; BWOFF-NEXT:    jl .LBB10_1
537; BWOFF-NEXT:  # %bb.2:
538; BWOFF-NEXT:    retq
539  br label %1
540
541; <label>:1
; Loop state: %.09 = byte offset into the destination %a (starts at 0, advances
; by 2), %.08 = cursor into the i8 index array %b.
542  %.09 = phi i64 [ 0, %0 ], [ %13, %1 ]
543  %.08 = phi i8* [ %b, %0 ], [ %12, %1 ]
; Read an i8 index, sign-extend it to i64, then load the bytes at %c + %3 and
; %c + %3 + 1.
544  %2 = load i8, i8* %.08, align 1
545  %3 = sext i8 %2 to i64
546  %4 = getelementptr inbounds i8, i8* %c, i64 %3
547  %5 = load i8, i8* %4, align 1
548  %6 = add nsw i64 %3, 1
549  %7 = getelementptr inbounds i8, i8* %c, i64 %6
550  %8 = load i8, i8* %7, align 1
551  %9 = getelementptr inbounds i8, i8* %a, i64 %.09
552  store i8 %5, i8* %9, align 1
; The second store address is formed with `or` rather than `add`. Since %.09
; starts at 0 and only ever advances by 2, `or %.09, 1` is the byte right
; after %9.
553  %10 = or i64 %.09, 1
554  %11 = getelementptr inbounds i8, i8* %a, i64 %10
555  store i8 %8, i8* %11, align 1
; Advance the index cursor by 1 and the destination offset by 2; keep looping
; while the new offset is (signed) less than %n.
556  %12 = getelementptr inbounds i8, i8* %.08, i64 1
557  %13 = add nuw nsw i64 %.09, 2
558  %14 = icmp slt i64 %13, %n
559  br i1 %14, label %1, label %15
560
561; <label>:15
562  ret void
563}
564
565; Make sure that we merge the consecutive load/store sequence below and use a
566; word (16 bit) instead of a byte copy even if there are intermediate sign
567; extensions.
568define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
569; BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext:
570; BWON:       # %bb.0:
571; BWON-NEXT:    movl %ecx, %r8d
572; BWON-NEXT:    xorl %ecx, %ecx
573; BWON-NEXT:    .p2align 4, 0x90
574; BWON-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
575; BWON-NEXT:    movsbq (%rdi,%rcx), %rax
576; BWON-NEXT:    movzwl (%rdx,%rax), %eax
577; BWON-NEXT:    movw %ax, (%rsi,%rcx,2)
578; BWON-NEXT:    incq %rcx
579; BWON-NEXT:    cmpl %ecx, %r8d
580; BWON-NEXT:    jne .LBB11_1
581; BWON-NEXT:  # %bb.2:
582; BWON-NEXT:    retq
583;
584; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext:
585; BWOFF:       # %bb.0:
586; BWOFF-NEXT:    movl %ecx, %r8d
587; BWOFF-NEXT:    xorl %ecx, %ecx
588; BWOFF-NEXT:    .p2align 4, 0x90
589; BWOFF-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
590; BWOFF-NEXT:    movsbq (%rdi,%rcx), %rax
591; BWOFF-NEXT:    movw (%rdx,%rax), %ax
592; BWOFF-NEXT:    movw %ax, (%rsi,%rcx,2)
593; BWOFF-NEXT:    incq %rcx
594; BWOFF-NEXT:    cmpl %ecx, %r8d
595; BWOFF-NEXT:    jne .LBB11_1
596; BWOFF-NEXT:  # %bb.2:
597; BWOFF-NEXT:    retq
598  br label %1
599
600; <label>:1
; Loop state: %.09 = remaining iteration count, %.08 = destination cursor into
; %b, %.0 = cursor into the i8 index array %a.
601  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
602  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
603  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
604  %2 = getelementptr inbounds i8, i8* %.0, i64 1
; The i8 index is sign-extended to i64 first, and the +1 for the second load
; is then computed in i64, so both load addresses derive from the same %4.
605  %3 = load i8, i8* %.0, align 1
606  %4 = sext i8 %3 to i64
607  %5 = getelementptr inbounds i8, i8* %c, i64 %4
608  %6 = load i8, i8* %5, align 1
609  %7 = add i64 %4, 1
610  %8 = getelementptr inbounds i8, i8* %c, i64 %7
611  %9 = load i8, i8* %8, align 1
; Store the two bytes to consecutive destination addresses, then advance the
; destination by 2 bytes.
612  store i8 %6, i8* %.08, align 1
613  %10 = getelementptr inbounds i8, i8* %.08, i64 1
614  store i8 %9, i8* %10, align 1
615  %11 = getelementptr inbounds i8, i8* %.08, i64 2
; Count %.09 down and leave the loop when it reaches zero.
616  %12 = add nsw i32 %.09, -1
617  %13 = icmp eq i32 %12, 0
618  br i1 %13, label %14, label %1
619
620; <label>:14
621  ret void
622}
623
624; However, we can only ignore sign extensions when merging if they are applied
625; to all memory computations.
626define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
627; BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex:
628; BWON:       # %bb.0:
629; BWON-NEXT:    movl %ecx, %r8d
630; BWON-NEXT:    xorl %ecx, %ecx
631; BWON-NEXT:    .p2align 4, 0x90
632; BWON-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
633; BWON-NEXT:    movsbq (%rdi,%rcx), %rax
634; BWON-NEXT:    movzbl (%rdx,%rax), %r9d
635; BWON-NEXT:    incl %eax
636; BWON-NEXT:    movsbq %al, %rax
637; BWON-NEXT:    movzbl (%rdx,%rax), %eax
638; BWON-NEXT:    movb %r9b, (%rsi,%rcx,2)
639; BWON-NEXT:    movb %al, 1(%rsi,%rcx,2)
640; BWON-NEXT:    incq %rcx
641; BWON-NEXT:    cmpl %ecx, %r8d
642; BWON-NEXT:    jne .LBB12_1
643; BWON-NEXT:  # %bb.2:
644; BWON-NEXT:    retq
645;
646; BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex:
647; BWOFF:       # %bb.0:
648; BWOFF-NEXT:    movl %ecx, %r8d
649; BWOFF-NEXT:    xorl %ecx, %ecx
650; BWOFF-NEXT:    .p2align 4, 0x90
651; BWOFF-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
652; BWOFF-NEXT:    movsbq (%rdi,%rcx), %rax
653; BWOFF-NEXT:    movb (%rdx,%rax), %r9b
654; BWOFF-NEXT:    incl %eax
655; BWOFF-NEXT:    movsbq %al, %rax
656; BWOFF-NEXT:    movb (%rdx,%rax), %al
657; BWOFF-NEXT:    movb %r9b, (%rsi,%rcx,2)
658; BWOFF-NEXT:    movb %al, 1(%rsi,%rcx,2)
659; BWOFF-NEXT:    incq %rcx
660; BWOFF-NEXT:    cmpl %ecx, %r8d
661; BWOFF-NEXT:    jne .LBB12_1
662; BWOFF-NEXT:  # %bb.2:
663; BWOFF-NEXT:    retq
664  br label %1
665
666; <label>:1
; Loop state: %.09 = remaining iteration count, %.08 = destination cursor into
; %b, %.0 = cursor into the i8 index array %a.
667  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
668  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
669  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
670  %2 = getelementptr inbounds i8, i8* %.0, i64 1
671  %3 = load i8, i8* %.0, align 1
672  %4 = sext i8 %3 to i64
673  %5 = getelementptr inbounds i8, i8* %c, i64 %4
674  %6 = load i8, i8* %5, align 1
; Unlike MergeLoadStoreBaseIndexOffsetSext, the +1 here is computed in i8
; (where it may wrap) and only then sign-extended, so the second load address
; is not simply the first plus one. The assertions above expect two separate
; byte loads and two separate byte stores.
675  %7 = add i8 %3, 1
676  %wrap.4 = sext i8 %7 to i64
677  %8 = getelementptr inbounds i8, i8* %c, i64 %wrap.4
678  %9 = load i8, i8* %8, align 1
679  store i8 %6, i8* %.08, align 1
680  %10 = getelementptr inbounds i8, i8* %.08, i64 1
681  store i8 %9, i8* %10, align 1
682  %11 = getelementptr inbounds i8, i8* %.08, i64 2
; Count %.09 down and leave the loop when it reaches zero.
683  %12 = add nsw i32 %.09, -1
684  %13 = icmp eq i32 %12, 0
685  br i1 %13, label %14, label %1
686
687; <label>:14
688  ret void
689}
690
691; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
692define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
693; CHECK-LABEL: merge_vec_element_store:
694; CHECK:       # %bb.0:
695; CHECK-NEXT:    vmovups %ymm0, (%rdi)
696; CHECK-NEXT:    vzeroupper
697; CHECK-NEXT:    retq
; Extract all eight lanes of %v in order.
698  %vecext0 = extractelement <8 x float> %v, i32 0
699  %vecext1 = extractelement <8 x float> %v, i32 1
700  %vecext2 = extractelement <8 x float> %v, i32 2
701  %vecext3 = extractelement <8 x float> %v, i32 3
702  %vecext4 = extractelement <8 x float> %v, i32 4
703  %vecext5 = extractelement <8 x float> %v, i32 5
704  %vecext6 = extractelement <8 x float> %v, i32 6
705  %vecext7 = extractelement <8 x float> %v, i32 7
706  %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 1
707  %arrayidx2 = getelementptr inbounds float, float* %ptr, i64 2
708  %arrayidx3 = getelementptr inbounds float, float* %ptr, i64 3
709  %arrayidx4 = getelementptr inbounds float, float* %ptr, i64 4
710  %arrayidx5 = getelementptr inbounds float, float* %ptr, i64 5
711  %arrayidx6 = getelementptr inbounds float, float* %ptr, i64 6
712  %arrayidx7 = getelementptr inbounds float, float* %ptr, i64 7
; Lane i is stored to %ptr[i], so the eight scalar stores together write %v
; unchanged. The assertions above expect one 32-byte vector store.
713  store float %vecext0, float* %ptr, align 4
714  store float %vecext1, float* %arrayidx1, align 4
715  store float %vecext2, float* %arrayidx2, align 4
716  store float %vecext3, float* %arrayidx3, align 4
717  store float %vecext4, float* %arrayidx4, align 4
718  store float %vecext5, float* %arrayidx5, align 4
719  store float %vecext6, float* %arrayidx6, align 4
720  store float %vecext7, float* %arrayidx7, align 4
721  ret void
722
723}
724
725; PR21711 - Merge vector stores into wider vector stores.
726; These should be merged into 32-byte stores.
727define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
728; CHECK-LABEL: merge_vec_extract_stores:
729; CHECK:       # %bb.0:
730; CHECK-NEXT:    vmovups %ymm0, 48(%rdi)
731; CHECK-NEXT:    vmovups %ymm1, 80(%rdi)
732; CHECK-NEXT:    vzeroupper
733; CHECK-NEXT:    retq
; Destination slots are elements 3..6 of a <4 x float> array at %ptr.
734  %idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
735  %idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
736  %idx2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
737  %idx3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 6
; Split each 8-element input into its low (lanes 0-3) and high (lanes 4-7)
; halves.
738  %shuffle0 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
739  %shuffle1 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
740  %shuffle2 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
741  %shuffle3 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; The low/high halves of %v1 go to slots 3 and 4, those of %v2 to slots 5 and
; 6. The assertions above expect each pair to become one 32-byte store.
742  store <4 x float> %shuffle0, <4 x float>* %idx0, align 16
743  store <4 x float> %shuffle1, <4 x float>* %idx1, align 16
744  store <4 x float> %shuffle2, <4 x float>* %idx2, align 16
745  store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
746  ret void
747
748}
749
750; Merging vector stores when sourced from vector loads.
751define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
752; CHECK-LABEL: merge_vec_stores_from_loads:
753; CHECK:       # %bb.0:
754; CHECK-NEXT:    vmovups (%rdi), %ymm0
755; CHECK-NEXT:    vmovups %ymm0, (%rsi)
756; CHECK-NEXT:    vzeroupper
757; CHECK-NEXT:    retq
; Load two adjacent <4 x float> values from %v.
758  %load_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 0
759  %load_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 1
760  %v0 = load <4 x float>, <4 x float>* %load_idx0
761  %v1 = load <4 x float>, <4 x float>* %load_idx1
; Store them to two adjacent slots at %ptr. The assertions above expect one
; 32-byte load and one 32-byte store.
762  %store_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 0
763  %store_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
764  store <4 x float> %v0, <4 x float>* %store_idx0, align 16
765  store <4 x float> %v1, <4 x float>* %store_idx1, align 16
766  ret void
767
768}
769
770; Merging vector stores when sourced from a constant vector is not currently handled.
; Two all-zero <4 x i32> stores go to elements 3 and 4 of %ptr (byte offsets 48
; and 64 in the expected assembly). The expected assembly zeroes one xmm
; register and issues two separate xmm stores rather than one wider store.
771define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
772; CHECK-LABEL: merge_vec_stores_of_constants:
773; CHECK:       # %bb.0:
774; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
775; CHECK-NEXT:    vmovaps %xmm0, 48(%rdi)
776; CHECK-NEXT:    vmovaps %xmm0, 64(%rdi)
777; CHECK-NEXT:    retq
  ; Adjacent destination slots.
778  %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
779  %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
  ; Same constant (all zeros) stored to both slots.
780  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx0, align 16
781  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx1, align 16
782  ret void
783
784}
785
786; This is a minimized test based on real code that was failing.
787; This should now be merged.
; Array elements 4 and 5 are written from two differently-shaped sources: a
; scalar i64 load of element 0, and lane 0 of a <2 x i64> load that starts at
; element 1. The expected assembly is one xmm load from the start of the array
; and one xmm store at byte offset 32.
788define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
789; CHECK-LABEL: merge_vec_element_and_scalar_load:
790; CHECK:       # %bb.0:
791; CHECK-NEXT:    vmovups (%rdi), %xmm0
792; CHECK-NEXT:    vmovups %xmm0, 32(%rdi)
793; CHECK-NEXT:    retq
794  %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
795  %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
796  %idx4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4
797  %idx5 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 5
798
  ; array[4] = array[0], via a plain scalar load.
799  %a0 = load i64, i64* %idx0, align 8
800  store i64 %a0, i64* %idx4, align 8
801
  ; array[5] = array[1], but read as lane 0 of a two-element vector load.
802  %b = bitcast i64* %idx1 to <2 x i64>*
803  %v = load <2 x i64>, <2 x i64>* %b, align 8
804  %a1 = extractelement <2 x i64> %v, i32 0
805  store i64 %a1, i64* %idx5, align 8
806  ret void
807
808}
809
810; Don't let a non-consecutive store thwart merging of the last two.
; Four i8 constants are stored at byte offsets 0, 42, 2 and 3 of %p, in that
; order. The expected assembly keeps the stores at offsets 0 and 42 as single
; byte stores and combines the stores at offsets 2 and 3 into one 16-bit store
; of 0x302.
811define void @almost_consecutive_stores(i8* %p) {
812; CHECK-LABEL: almost_consecutive_stores:
813; CHECK:       # %bb.0:
814; CHECK-NEXT:    movb $0, (%rdi)
815; CHECK-NEXT:    movb $1, 42(%rdi)
816; CHECK-NEXT:    movw $770, 2(%rdi) # imm = 0x302
817; CHECK-NEXT:    retq
818  store i8 0, i8* %p
  ; Offset 42 is not adjacent to any other store in this function.
819  %p1 = getelementptr i8, i8* %p, i64 42
820  store i8 1, i8* %p1
  ; Offsets 2 and 3 are adjacent to each other.
821  %p2 = getelementptr i8, i8* %p, i64 2
822  store i8 2, i8* %p2
823  %p3 = getelementptr i8, i8* %p, i64 3
824  store i8 3, i8* %p3
825  ret void
826}
827
828; We should be able to merge these.
; All four lanes of %v are stored as floats to four consecutive slots of %ptr,
; but the lanes reach float type by two different routes (see below). The
; expected assembly is a single xmm store of the incoming vector register.
829define void @merge_bitcast(<4 x i32> %v, float* %ptr) {
830; CHECK-LABEL: merge_bitcast:
831; CHECK:       # %bb.0:
832; CHECK-NEXT:    vmovups %xmm0, (%rdi)
833; CHECK-NEXT:    retq
  ; Lane 0: bitcast the whole vector to <4 x float>, then extract.
  ; Lanes 1-3: extract as i32, then bitcast each scalar to float.
834  %fv = bitcast <4 x i32> %v to <4 x float>
835  %vecext1 = extractelement <4 x i32> %v, i32 1
836  %vecext2 = extractelement <4 x i32> %v, i32 2
837  %vecext3 = extractelement <4 x i32> %v, i32 3
838  %f0 = extractelement <4 x float> %fv, i32 0
839  %f1 = bitcast i32 %vecext1 to float
840  %f2 = bitcast i32 %vecext2 to float
841  %f3 = bitcast i32 %vecext3 to float
  ; Four consecutive float slots starting at %ptr.
842  %idx0 = getelementptr inbounds float, float* %ptr, i64 0
843  %idx1 = getelementptr inbounds float, float* %ptr, i64 1
844  %idx2 = getelementptr inbounds float, float* %ptr, i64 2
845  %idx3 = getelementptr inbounds float, float* %ptr, i64 3
846  store float %f0, float* %idx0, align 4
847  store float %f1, float* %idx1, align 4
848  store float %f2, float* %idx2, align 4
849  store float %f3, float* %idx3, align 4
850  ret void
851}
852
853; same as @merge_const_store with heterogeneous types.
; Each loop iteration stores the i8 constants 1, 2, 3, 4 to fields 0-3 of the
; current %struct.C and the i32 constant 134678021 (0x08070605) to field 4,
; then steps to the next %struct.C. The expected assembly performs one 8-byte
; store of 0x0807060504030201 per iteration and advances the pointer by 24.
854define void @merge_const_store_heterogeneous(i32 %count, %struct.C* nocapture %p) nounwind uwtable noinline ssp {
855; CHECK-LABEL: merge_const_store_heterogeneous:
856; CHECK:       # %bb.0:
857; CHECK-NEXT:    testl %edi, %edi
858; CHECK-NEXT:    jle .LBB20_3
859; CHECK-NEXT:  # %bb.1: # %.lr.ph.preheader
860; CHECK-NEXT:    movabsq $578437695752307201, %rax # imm = 0x807060504030201
861; CHECK-NEXT:    .p2align 4, 0x90
862; CHECK-NEXT:  .LBB20_2: # %.lr.ph
863; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
864; CHECK-NEXT:    movq %rax, (%rsi)
865; CHECK-NEXT:    addq $24, %rsi
866; CHECK-NEXT:    decl %edi
867; CHECK-NEXT:    jne .LBB20_2
868; CHECK-NEXT:  .LBB20_3: # %._crit_edge
869; CHECK-NEXT:    retq
  ; Skip the loop entirely when %count <= 0.
870  %1 = icmp sgt i32 %count, 0
871  br i1 %1, label %.lr.ph, label %._crit_edge
872.lr.ph:
  ; %i.02 counts iterations from 0; %.01 is the struct written this iteration.
873  %i.02 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ]
874  %.01 = phi %struct.C* [ %8, %.lr.ph ], [ %p, %0 ]
  ; Four single-byte constant stores to the leading i8 fields.
875  %2 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 0
876  store i8 1, i8* %2, align 1
877  %3 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 1
878  store i8 2, i8* %3, align 1
879  %4 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 2
880  store i8 3, i8* %4, align 1
881  %5 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 3
882  store i8 4, i8* %5, align 1
  ; One i32 constant store to the first i32 field; note it is marked align 1.
883  %6 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 4
884  store i32 134678021, i32* %6, align 1
  ; Advance the counter and the struct pointer; exit once %count structs are written.
885  %7 = add nsw i32 %i.02, 1
886  %8 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 1
887  %exitcond = icmp eq i32 %7, %count
888  br i1 %exitcond, label %._crit_edge, label %.lr.ph
889._crit_edge:
890  ret void
891}
892
893; Merging heterogeneous integer types.
; Fields 0-4 of the %struct.C at %p (four i8 fields followed by one i32 field)
; are loaded and then stored to the same fields of the %struct.C at %q. The
; expected assembly is one 8-byte load and one 8-byte store.
894define void @merge_heterogeneous(%struct.C* nocapture %p, %struct.C* nocapture %q) {
895; CHECK-LABEL: merge_heterogeneous:
896; CHECK:       # %bb.0:
897; CHECK-NEXT:    movq (%rdi), %rax
898; CHECK-NEXT:    movq %rax, (%rsi)
899; CHECK-NEXT:    retq
  ; Source field addresses in %p.
900  %s0 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 0
901  %s1 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 1
902  %s2 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 2
903  %s3 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 3
904  %s4 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 4
  ; Destination field addresses in %q.
905  %d0 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 0
906  %d1 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 1
907  %d2 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 2
908  %d3 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 3
909  %d4 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 4
  ; All loads are issued before any store.
910  %v0 = load i8, i8* %s0, align 1
911  %v1 = load i8, i8* %s1, align 1
912  %v2 = load i8, i8* %s2, align 1
913  %v3 = load i8, i8* %s3, align 1
914  %v4 = load i32, i32* %s4, align 1
915  store i8 %v0, i8* %d0, align 1
916  store i8 %v1, i8* %d1, align 1
917  store i8 %v2, i8* %d2, align 1
918  store i8 %v3, i8* %d3, align 1
  ; The i32 store is marked align 4, unlike the align 1 on the matching load.
919  store i32 %v4, i32* %d4, align 4
920  ret void
921}
922
923