1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
3; RUN: llc  -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
7
; Intrinsics called by the test functions below.
8declare i1 @llvm.amdgcn.wqm.vote(i1)
9declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
10declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)
11
12; Show what the atomic optimization pass will do for raw buffers.
13
; add_i32_constant: amdgpu_ps test in which the constant 5 is added through
; llvm.amdgcn.raw.buffer.atomic.add (with %inout as the <4 x i32> operand), with
; an llvm.amdgcn.wqm.vote(i1 true) call on either side of the atomic. The value
; returned by the atomic is stored with %out as the <4 x i32> operand only when
; both votes are true.
;
; For every check prefix the expected code issues the atomic only in lanes whose
; v_mbcnt result is 0, adding s_bcnt1(saved exec) * 5 (v_mul_u32_u24), and then
; rebuilds the per-lane value from v_readfirstlane plus v_mad_u32_u24 using the
; lane's v_mbcnt result.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
14define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
15; GFX7-LABEL: add_i32_constant:
16; GFX7:       ; %bb.0: ; %entry
17; GFX7-NEXT:    s_mov_b64 s[10:11], exec
18; GFX7-NEXT:    ; implicit-def: $vgpr0
19; GFX7-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
20; GFX7-NEXT:    s_cbranch_execz BB0_4
21; GFX7-NEXT:  ; %bb.1:
22; GFX7-NEXT:    s_mov_b64 s[12:13], exec
23; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s12, 0
24; GFX7-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s13, v0
25; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
26; GFX7-NEXT:    ; implicit-def: $vgpr1
27; GFX7-NEXT:    s_and_saveexec_b64 s[10:11], vcc
28; GFX7-NEXT:    s_cbranch_execz BB0_3
29; GFX7-NEXT:  ; %bb.2:
30; GFX7-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
31; GFX7-NEXT:    v_mul_u32_u24_e64 v1, s12, 5
32; GFX7-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
33; GFX7-NEXT:  BB0_3:
34; GFX7-NEXT:    s_or_b64 exec, exec, s[10:11]
35; GFX7-NEXT:    s_waitcnt vmcnt(0)
36; GFX7-NEXT:    v_readfirstlane_b32 s4, v1
37; GFX7-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
38; GFX7-NEXT:  BB0_4: ; %Flow
39; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
40; GFX7-NEXT:    s_wqm_b64 s[4:5], -1
41; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
42; GFX7-NEXT:    s_cbranch_vccnz BB0_6
43; GFX7-NEXT:  ; %bb.5: ; %if
44; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
45; GFX7-NEXT:  BB0_6: ; %UnifiedReturnBlock
46; GFX7-NEXT:    s_endpgm
47;
48; GFX8-LABEL: add_i32_constant:
49; GFX8:       ; %bb.0: ; %entry
50; GFX8-NEXT:    s_mov_b64 s[10:11], exec
51; GFX8-NEXT:    ; implicit-def: $vgpr0
52; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
53; GFX8-NEXT:    s_cbranch_execz BB0_4
54; GFX8-NEXT:  ; %bb.1:
55; GFX8-NEXT:    s_mov_b64 s[12:13], exec
56; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
57; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
58; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
59; GFX8-NEXT:    ; implicit-def: $vgpr1
60; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
61; GFX8-NEXT:    s_cbranch_execz BB0_3
62; GFX8-NEXT:  ; %bb.2:
63; GFX8-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
64; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s12, 5
65; GFX8-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
66; GFX8-NEXT:  BB0_3:
67; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
68; GFX8-NEXT:    s_waitcnt vmcnt(0)
69; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
70; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
71; GFX8-NEXT:  BB0_4: ; %Flow
72; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
73; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
74; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
75; GFX8-NEXT:    s_cbranch_vccnz BB0_6
76; GFX8-NEXT:  ; %bb.5: ; %if
77; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
78; GFX8-NEXT:  BB0_6: ; %UnifiedReturnBlock
79; GFX8-NEXT:    s_endpgm
80;
81; GFX9-LABEL: add_i32_constant:
82; GFX9:       ; %bb.0: ; %entry
83; GFX9-NEXT:    s_mov_b64 s[10:11], exec
84; GFX9-NEXT:    ; implicit-def: $vgpr0
85; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
86; GFX9-NEXT:    s_cbranch_execz BB0_4
87; GFX9-NEXT:  ; %bb.1:
88; GFX9-NEXT:    s_mov_b64 s[12:13], exec
89; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
90; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
91; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
92; GFX9-NEXT:    ; implicit-def: $vgpr1
93; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
94; GFX9-NEXT:    s_cbranch_execz BB0_3
95; GFX9-NEXT:  ; %bb.2:
96; GFX9-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
97; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s12, 5
98; GFX9-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
99; GFX9-NEXT:  BB0_3:
100; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
101; GFX9-NEXT:    s_waitcnt vmcnt(0)
102; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
103; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
104; GFX9-NEXT:  BB0_4: ; %Flow
105; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
106; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
107; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
108; GFX9-NEXT:    s_cbranch_vccnz BB0_6
109; GFX9-NEXT:  ; %bb.5: ; %if
110; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
111; GFX9-NEXT:  BB0_6: ; %UnifiedReturnBlock
112; GFX9-NEXT:    s_endpgm
113;
114; GFX1064-LABEL: add_i32_constant:
115; GFX1064:       ; %bb.0: ; %entry
116; GFX1064-NEXT:    s_mov_b64 s[10:11], exec
117; GFX1064-NEXT:    ; implicit-def: $vgpr0
118; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
119; GFX1064-NEXT:    s_cbranch_execz BB0_4
120; GFX1064-NEXT:  ; %bb.1:
121; GFX1064-NEXT:    s_mov_b64 s[12:13], exec
122; GFX1064-NEXT:    ; implicit-def: $vgpr1
123; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s12, 0
124; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s13, v0
125; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
126; GFX1064-NEXT:    s_and_saveexec_b64 s[28:29], vcc
127; GFX1064-NEXT:    s_cbranch_execz BB0_3
128; GFX1064-NEXT:  ; %bb.2:
129; GFX1064-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
130; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s12, 5
131; GFX1064-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
132; GFX1064-NEXT:  BB0_3:
133; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
134; GFX1064-NEXT:    s_or_b64 exec, exec, s[28:29]
135; GFX1064-NEXT:    s_waitcnt vmcnt(0)
136; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
137; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
138; GFX1064-NEXT:  BB0_4: ; %Flow
139; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
140; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
141; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
142; GFX1064-NEXT:    s_cbranch_vccnz BB0_6
143; GFX1064-NEXT:  ; %bb.5: ; %if
144; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
145; GFX1064-NEXT:  BB0_6: ; %UnifiedReturnBlock
146; GFX1064-NEXT:    s_endpgm
147;
148; GFX1032-LABEL: add_i32_constant:
149; GFX1032:       ; %bb.0: ; %entry
150; GFX1032-NEXT:    s_mov_b32 s9, exec_lo
151; GFX1032-NEXT:    ; implicit-def: $vgpr0
152; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
153; GFX1032-NEXT:    s_cbranch_execz BB0_4
154; GFX1032-NEXT:  ; %bb.1:
155; GFX1032-NEXT:    s_mov_b32 s10, exec_lo
156; GFX1032-NEXT:    ; implicit-def: $vgpr1
157; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s10, 0
158; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
159; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
160; GFX1032-NEXT:    s_cbranch_execz BB0_3
161; GFX1032-NEXT:  ; %bb.2:
162; GFX1032-NEXT:    s_bcnt1_i32_b32 s10, s10
163; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s10, 5
164; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
165; GFX1032-NEXT:  BB0_3:
166; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
167; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
168; GFX1032-NEXT:    s_waitcnt vmcnt(0)
169; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
170; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
171; GFX1032-NEXT:  BB0_4: ; %Flow
172; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
173; GFX1032-NEXT:    s_wqm_b32 s4, -1
174; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
175; GFX1032-NEXT:    s_cbranch_vccnz BB0_6
176; GFX1032-NEXT:  ; %bb.5: ; %if
177; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
178; GFX1032-NEXT:  BB0_6: ; %UnifiedReturnBlock
179; GFX1032-NEXT:    s_endpgm
180entry:
; First vote, issued before the atomic.
181  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
; The atomic under test: the value operand is the constant 5.
182  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
; Second vote, issued after the atomic.
183  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
; Branch to %if only when both votes returned true.
184  %cond = and i1 %cond1, %cond2
185  br i1 %cond, label %if, label %else
186if:
; Store the value returned by the atomic, reinterpreted as float.
187  %bitcast = bitcast i32 %old to float
188  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
189  ret void
190else:
191  ret void
192}
193
; add_i32_varying: amdgpu_ps test in which the value added through
; llvm.amdgcn.raw.buffer.atomic.add is the non-inreg i32 argument %val (with
; %inout as the <4 x i32> operand), with an llvm.amdgcn.wqm.vote(i1 true) call
; on either side of the atomic. The value returned by the atomic is stored with
; %out as the <4 x i32> operand only when both votes are true.
;
; Expected code per check prefix:
;   GFX7             - a single buffer_atomic_add on v0, with no cross-lane
;                      reduction and no v_mbcnt lane selection.
;   GFX8, GFX9       - v2 is set to 0 under the inverted exec mask, a chain of
;                      v_add_u32_dpp steps (row_shr:1/2/4/8, row_bcast:15/31)
;                      accumulates the values, lane 63 is read into s12, and
;                      only lanes whose v_mbcnt result is 0 issue the atomic
;                      with that value.
;   GFX1064, GFX1032 - v_add_nc_u32_dpp steps combined with v_permlanex16_b32
;                      and v_readlane/v_writelane; the atomic operand is read
;                      from lane 63 (GFX1064) or lane 31 (GFX1032), and only
;                      lanes whose v_mbcnt result is 0 issue the atomic.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
194define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
195; GFX7-LABEL: add_i32_varying:
196; GFX7:       ; %bb.0: ; %entry
197; GFX7-NEXT:    s_wqm_b64 s[8:9], -1
198; GFX7-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
199; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
200; GFX7-NEXT:    s_cbranch_vccnz BB1_2
201; GFX7-NEXT:  ; %bb.1: ; %if
202; GFX7-NEXT:    s_waitcnt vmcnt(0)
203; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
204; GFX7-NEXT:  BB1_2: ; %else
205; GFX7-NEXT:    s_endpgm
206;
207; GFX8-LABEL: add_i32_varying:
208; GFX8:       ; %bb.0: ; %entry
209; GFX8-NEXT:    s_mov_b64 s[10:11], exec
210; GFX8-NEXT:    v_mov_b32_e32 v2, v0
211; GFX8-NEXT:    ; implicit-def: $vgpr0
212; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
213; GFX8-NEXT:    s_cbranch_execz BB1_4
214; GFX8-NEXT:  ; %bb.1:
215; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
216; GFX8-NEXT:    v_mov_b32_e32 v1, 0
217; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
218; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
219; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
220; GFX8-NEXT:    s_not_b64 exec, exec
221; GFX8-NEXT:    v_mov_b32_e32 v2, 0
222; GFX8-NEXT:    s_not_b64 exec, exec
223; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
224; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
225; GFX8-NEXT:    s_nop 1
226; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
227; GFX8-NEXT:    s_nop 1
228; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
229; GFX8-NEXT:    s_nop 1
230; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
231; GFX8-NEXT:    s_nop 1
232; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
233; GFX8-NEXT:    s_nop 1
234; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
235; GFX8-NEXT:    v_readlane_b32 s12, v2, 63
236; GFX8-NEXT:    s_nop 0
237; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
238; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
239; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
240; GFX8-NEXT:    ; implicit-def: $vgpr0
241; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
242; GFX8-NEXT:    s_cbranch_execz BB1_3
243; GFX8-NEXT:  ; %bb.2:
244; GFX8-NEXT:    v_mov_b32_e32 v0, s12
245; GFX8-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
246; GFX8-NEXT:  BB1_3:
247; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
248; GFX8-NEXT:    s_waitcnt vmcnt(0)
249; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
250; GFX8-NEXT:    v_mov_b32_e32 v0, v1
251; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
252; GFX8-NEXT:  BB1_4: ; %Flow
253; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
254; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
255; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
256; GFX8-NEXT:    s_cbranch_vccnz BB1_6
257; GFX8-NEXT:  ; %bb.5: ; %if
258; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
259; GFX8-NEXT:  BB1_6: ; %UnifiedReturnBlock
260; GFX8-NEXT:    s_endpgm
261;
262; GFX9-LABEL: add_i32_varying:
263; GFX9:       ; %bb.0: ; %entry
264; GFX9-NEXT:    s_mov_b64 s[10:11], exec
265; GFX9-NEXT:    v_mov_b32_e32 v2, v0
266; GFX9-NEXT:    ; implicit-def: $vgpr0
267; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
268; GFX9-NEXT:    s_cbranch_execz BB1_4
269; GFX9-NEXT:  ; %bb.1:
270; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
271; GFX9-NEXT:    v_mov_b32_e32 v1, 0
272; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
273; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
274; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
275; GFX9-NEXT:    s_not_b64 exec, exec
276; GFX9-NEXT:    v_mov_b32_e32 v2, 0
277; GFX9-NEXT:    s_not_b64 exec, exec
278; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
279; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
280; GFX9-NEXT:    s_nop 1
281; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
282; GFX9-NEXT:    s_nop 1
283; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
284; GFX9-NEXT:    s_nop 1
285; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
286; GFX9-NEXT:    s_nop 1
287; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
288; GFX9-NEXT:    s_nop 1
289; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
290; GFX9-NEXT:    v_readlane_b32 s12, v2, 63
291; GFX9-NEXT:    s_nop 0
292; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
293; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
294; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
295; GFX9-NEXT:    ; implicit-def: $vgpr0
296; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
297; GFX9-NEXT:    s_cbranch_execz BB1_3
298; GFX9-NEXT:  ; %bb.2:
299; GFX9-NEXT:    v_mov_b32_e32 v0, s12
300; GFX9-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
301; GFX9-NEXT:  BB1_3:
302; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
303; GFX9-NEXT:    s_waitcnt vmcnt(0)
304; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
305; GFX9-NEXT:    v_mov_b32_e32 v0, v1
306; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
307; GFX9-NEXT:  BB1_4: ; %Flow
308; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
309; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
310; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
311; GFX9-NEXT:    s_cbranch_vccnz BB1_6
312; GFX9-NEXT:  ; %bb.5: ; %if
313; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
314; GFX9-NEXT:  BB1_6: ; %UnifiedReturnBlock
315; GFX9-NEXT:    s_endpgm
316;
317; GFX1064-LABEL: add_i32_varying:
318; GFX1064:       ; %bb.0: ; %entry
319; GFX1064-NEXT:    s_mov_b64 s[10:11], exec
320; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
321; GFX1064-NEXT:    ; implicit-def: $vgpr0
322; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
323; GFX1064-NEXT:    s_cbranch_execz BB1_4
324; GFX1064-NEXT:  ; %bb.1:
325; GFX1064-NEXT:    s_not_b64 exec, exec
326; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
327; GFX1064-NEXT:    s_not_b64 exec, exec
328; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
329; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
330; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
331; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
332; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
333; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
334; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
335; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
336; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
337; GFX1064-NEXT:    v_readlane_b32 s12, v1, 31
338; GFX1064-NEXT:    v_mov_b32_e32 v2, s12
339; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
340; GFX1064-NEXT:    v_readlane_b32 s12, v1, 15
341; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
342; GFX1064-NEXT:    v_readlane_b32 s13, v1, 31
343; GFX1064-NEXT:    v_writelane_b32 v3, s12, 16
344; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
345; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
346; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
347; GFX1064-NEXT:    v_readlane_b32 s12, v1, 63
348; GFX1064-NEXT:    v_readlane_b32 s14, v1, 47
349; GFX1064-NEXT:    v_writelane_b32 v3, s13, 32
350; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
351; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
352; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
353; GFX1064-NEXT:    v_writelane_b32 v3, s14, 48
354; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
355; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
356; GFX1064-NEXT:    ; implicit-def: $vgpr0
357; GFX1064-NEXT:    s_and_saveexec_b64 s[28:29], vcc
358; GFX1064-NEXT:    s_cbranch_execz BB1_3
359; GFX1064-NEXT:  ; %bb.2:
360; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
361; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
362; GFX1064-NEXT:  BB1_3:
363; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
364; GFX1064-NEXT:    s_or_b64 exec, exec, s[28:29]
365; GFX1064-NEXT:    s_waitcnt vmcnt(0)
366; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
367; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
368; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s4, v0
369; GFX1064-NEXT:  BB1_4: ; %Flow
370; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
371; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
372; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
373; GFX1064-NEXT:    s_cbranch_vccnz BB1_6
374; GFX1064-NEXT:  ; %bb.5: ; %if
375; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
376; GFX1064-NEXT:  BB1_6: ; %UnifiedReturnBlock
377; GFX1064-NEXT:    s_endpgm
378;
379; GFX1032-LABEL: add_i32_varying:
380; GFX1032:       ; %bb.0: ; %entry
381; GFX1032-NEXT:    s_mov_b32 s9, exec_lo
382; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
383; GFX1032-NEXT:    ; implicit-def: $vgpr0
384; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
385; GFX1032-NEXT:    s_cbranch_execz BB1_4
386; GFX1032-NEXT:  ; %bb.1:
387; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
388; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
389; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
390; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
391; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
392; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
393; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
394; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
395; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
396; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
397; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
398; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
399; GFX1032-NEXT:    v_readlane_b32 s11, v1, 31
400; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
401; GFX1032-NEXT:    v_readlane_b32 s10, v1, 15
402; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
403; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
404; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
405; GFX1032-NEXT:    v_writelane_b32 v3, s10, 16
406; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
407; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
408; GFX1032-NEXT:    ; implicit-def: $vgpr0
409; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
410; GFX1032-NEXT:    s_cbranch_execz BB1_3
411; GFX1032-NEXT:  ; %bb.2:
412; GFX1032-NEXT:    v_mov_b32_e32 v0, s11
413; GFX1032-NEXT:    s_mov_b32 s10, s11
414; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
415; GFX1032-NEXT:  BB1_3:
416; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
417; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
418; GFX1032-NEXT:    s_waitcnt vmcnt(0)
419; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
420; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
421; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s4, v0
422; GFX1032-NEXT:  BB1_4: ; %Flow
423; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
424; GFX1032-NEXT:    s_wqm_b32 s4, -1
425; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
426; GFX1032-NEXT:    s_cbranch_vccnz BB1_6
427; GFX1032-NEXT:  ; %bb.5: ; %if
428; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
429; GFX1032-NEXT:  BB1_6: ; %UnifiedReturnBlock
430; GFX1032-NEXT:    s_endpgm
431entry:
; First vote, issued before the atomic.
432  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
; The atomic under test: the value operand is the function argument %val.
433  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i32 0)
; Second vote, issued after the atomic.
434  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
; Branch to %if only when both votes returned true.
435  %cond = and i1 %cond1, %cond2
436  br i1 %cond, label %if, label %else
437if:
; Store the value returned by the atomic, reinterpreted as float.
438  %bitcast = bitcast i32 %old to float
439  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
440  ret void
441else:
442  ret void
443}
444