; Test that loads/stores don't move across a nacl.atomic.fence.all.
; This should apply to both atomic and non-atomic loads/stores
; (unlike the non-"all" variety of nacl.atomic.fence, which only
; applies to atomic loads/stores).
;
; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s

; NaCl atomic intrinsics exercised by this file: the "all" fence under test,
; plus the 32-bit atomic load and store used for the g32_a accesses.
declare void @llvm.nacl.atomic.fence.all()
declare i32 @llvm.nacl.atomic.load.i32(i32*, i32)
declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)

; Zero-initialized, 4-byte-aligned [4 x i8] globals. The functions in this
; file bitcast them to i32* (and, for g32_d, also to i8*) before accessing
; them.
@g32_a = internal global [4 x i8] zeroinitializer, align 4
@g32_b = internal global [4 x i8] zeroinitializer, align 4
@g32_c = internal global [4 x i8] zeroinitializer, align 4
@g32_d = internal global [4 x i8] zeroinitializer, align 4

; Fence placed between the g32_c load/sub and the g32_c store: the expected
; output has the g32_c sub before the mfence and the g32_c store after it.
; Returns the value stored to g32_c (1 - old g32_c).
define internal i32 @test_fused_load_sub_a() {
entry:
  ; Plain store of 999 (0x3e7) to a 4-byte stack slot; matched first in the
  ; expectations below.
  %p_alloca = alloca i8, i32 4, align 4
  %p_alloca_bc = bitcast i8* %p_alloca to i32*
  store i32 999, i32* %p_alloca_bc, align 1

  ; g32_a: atomic load, compute 1 - value, atomic store (memory-order arg 6).
  %p_a = bitcast [4 x i8]* @g32_a to i32*
  %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
  %l_a2 = sub i32 1, %l_a
  call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)

  ; g32_b: the same read-modify-write with ordinary (non-atomic) accesses.
  %p_b = bitcast [4 x i8]* @g32_b to i32*
  %l_b = load i32, i32* %p_b, align 1
  %l_b2 = sub i32 1, %l_b
  store i32 %l_b2, i32* %p_b, align 1

  ; g32_c: ordinary load and sub, then the fence, then the ordinary store.
  %p_c = bitcast [4 x i8]* @g32_c to i32*
  %l_c = load i32, i32* %p_c, align 1
  %l_c2 = sub i32 1, %l_c
  call void @llvm.nacl.atomic.fence.all()
  store i32 %l_c2, i32* %p_c, align 1

  ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_sub_a
;    alloca store
; CHECK: mov DWORD PTR {{.*}},0x3e7
;    atomic store (w/ its own mfence)
; The load + sub are optimized into one everywhere.
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_a)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: mfence
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_b)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_c)|(.bss)}}
; CHECK: mfence
; CHECK: mov {{(DWORD PTR)?}}

; Test with the fence moved up a bit: here the fence comes before the g32_c
; load, so the whole g32_c load/sub/store sequence sits after the fence.
; Returns the value stored to g32_c (1 - old g32_c).
define internal i32 @test_fused_load_sub_b() {
entry:
  ; Plain store of 999 (0x3e7) to a 4-byte stack slot; matched first in the
  ; expectations below.
  %p_alloca = alloca i8, i32 4, align 4
  %p_alloca_bc = bitcast i8* %p_alloca to i32*
  store i32 999, i32* %p_alloca_bc, align 1

  ; g32_a: atomic load, compute 1 - value, atomic store (memory-order arg 6).
  %p_a = bitcast [4 x i8]* @g32_a to i32*
  %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
  %l_a2 = sub i32 1, %l_a
  call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)

  ; g32_b: the same read-modify-write with ordinary (non-atomic) accesses.
  %p_b = bitcast [4 x i8]* @g32_b to i32*
  %l_b = load i32, i32* %p_b, align 1
  %l_b2 = sub i32 1, %l_b
  store i32 %l_b2, i32* %p_b, align 1

  ; g32_c: the fence first, then an ordinary load, sub and store.
  %p_c = bitcast [4 x i8]* @g32_c to i32*
  call void @llvm.nacl.atomic.fence.all()
  %l_c = load i32, i32* %p_c, align 1
  %l_c2 = sub i32 1, %l_c
  store i32 %l_c2, i32* %p_c, align 1

  ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_sub_b
;    alloca store
; CHECK: mov DWORD PTR {{.*}},0x3e7
;    atomic store (w/ its own mfence)
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_a)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: mfence
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_b)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: mfence
; Load + sub can still be optimized into one instruction
; because it is not separated by a fence.
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_c)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}

; Test with the fence splitting a load/sub: the fence sits between the g32_b
; load and the sub that consumes it, so the expected output has a separate
; load before the mfence and the sub after it.
; Returns the value stored to g32_c (1 - old g32_c).
define internal i32 @test_fused_load_sub_c() {
entry:
  ; Plain store of 999 (0x3e7) to a 4-byte stack slot; matched first in the
  ; expectations below.
  %p_alloca = alloca i8, i32 4, align 4
  %p_alloca_bc = bitcast i8* %p_alloca to i32*
  store i32 999, i32* %p_alloca_bc, align 1

  ; g32_a: atomic load, compute 1 - value, atomic store (memory-order arg 6).
  %p_a = bitcast [4 x i8]* @g32_a to i32*
  %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
  %l_a2 = sub i32 1, %l_a
  call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)

  ; g32_b: ordinary load, then the fence, then the sub and ordinary store.
  %p_b = bitcast [4 x i8]* @g32_b to i32*
  %l_b = load i32, i32* %p_b, align 1
  call void @llvm.nacl.atomic.fence.all()
  %l_b2 = sub i32 1, %l_b
  store i32 %l_b2, i32* %p_b, align 1

  ; g32_c: ordinary load, sub and store with no fence in between.
  %p_c = bitcast [4 x i8]* @g32_c to i32*
  %l_c = load i32, i32* %p_c, align 1
  %l_c2 = sub i32 1, %l_c
  store i32 %l_c2, i32* %p_c, align 1

  ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_sub_c
;    alloca store
; CHECK: mov DWORD PTR {{.*}},0x3e7
;    atomic store (w/ its own mfence)
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_a)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: mfence
; This load + sub are no longer optimized into one,
; though perhaps it should be legal as long as
; the load stays on the same side of the fence.
; CHECK: mov {{.*}},{{(DWORD PTR )?}}{{.*}}{{(g32_b)|(.bss)}}
; CHECK: mfence
; CHECK: mov {{.*}},0x1
; CHECK: sub
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_c)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}


; Test where a bunch of i8 loads could have been fused into one
; i32 load, but a fence blocks that.
; Bytes 0..2 of g32_d are loaded before the fence and byte 3 after it; the
; four bytes are then zero-extended, shifted and OR-ed into the i32 result
; (byte 0 in bits 0-7, byte 3 in bits 24-31).
define internal i32 @could_have_fused_loads() {
entry:
  ; Byte at offset 0.
  %ptr1 = bitcast [4 x i8]* @g32_d to i8*
  %b1 = load i8, i8* %ptr1, align 1

  ; Byte at offset 1, addressed via ptrtoint / add / inttoptr.
  %int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32
  %int_ptr_bump2 = add i32 %int_ptr2, 1
  %ptr2 = inttoptr i32 %int_ptr_bump2 to i8*
  %b2 = load i8, i8* %ptr2, align 1

  ; Byte at offset 2.
  %int_ptr_bump3 = add i32 %int_ptr2, 2
  %ptr3 = inttoptr i32 %int_ptr_bump3 to i8*
  %b3 = load i8, i8* %ptr3, align 1

  ; The fence separates the first three byte loads from the fourth.
  call void @llvm.nacl.atomic.fence.all()

  ; Byte at offset 3, loaded after the fence.
  %int_ptr_bump4 = add i32 %int_ptr2, 3
  %ptr4 = inttoptr i32 %int_ptr_bump4 to i8*
  %b4 = load i8, i8* %ptr4, align 1

  ; Assemble b1 | (b2 << 8) | (b3 << 16) | (b4 << 24).
  %b1.ext = zext i8 %b1 to i32
  %b2.ext = zext i8 %b2 to i32
  %b2.shift = shl i32 %b2.ext, 8
  %b12 = or i32 %b1.ext, %b2.shift
  %b3.ext = zext i8 %b3 to i32
  %b3.shift = shl i32 %b3.ext, 16
  %b123 = or i32 %b12, %b3.shift
  %b4.ext = zext i8 %b4 to i32
  %b4.shift = shl i32 %b4.ext, 24
  %b1234 = or i32 %b123, %b4.shift
  ret i32 %b1234
}
; CHECK-LABEL: could_have_fused_loads
; CHECK: mov {{.*}},{{(BYTE PTR)?}}
; CHECK: mov {{.*}},BYTE PTR
; CHECK: mov {{.*}},BYTE PTR
; CHECK: mfence
; CHECK: mov {{.*}},BYTE PTR


; Test where an identical load from two branches could have been hoisted
; up, and then the code merged, but a fence prevents it.
; %x == 1 takes branch1 (plain load of g32_d); any other value takes branch2
; (fence, then the same load). Each branch returns the value it loaded.
define internal i32 @could_have_hoisted_loads(i32 %x) {
entry:
  %ptr = bitcast [4 x i8]* @g32_d to i32*
  %cmp = icmp eq i32 %x, 1
  br i1 %cmp, label %branch1, label %branch2
branch1:
  ; No fence on this path.
  %y = load i32, i32* %ptr, align 1
  ret i32 %y
branch2:
  ; The fence precedes the load on this path only.
  call void @llvm.nacl.atomic.fence.all()
  %z = load i32, i32* %ptr, align 1
  ret i32 %z
}
; CHECK-LABEL: could_have_hoisted_loads
; CHECK: jne {{.*}}
; CHECK: mov {{.*}},{{(DWORD PTR )?}}{{.*}}{{(g32_d)|(.bss)}}
; CHECK: ret
; CHECK: mfence
; CHECK: mov {{.*}},{{(DWORD PTR )?}}{{.*}}{{(g32_d)|(.bss)}}
; CHECK: ret
203