1# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-insert-waitcnts  %s -o - | FileCheck -check-prefixes=CHECK,GFX89 %s
2# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-waitcnts  %s -o - | FileCheck -check-prefixes=CHECK,GFX89 %s
3
# Embedded LLVM IR module: one stub kernel (its body is only `ret void`) for
# each machine function defined later in this file. Only @flat_zero_waitcnt
# declares parameters; they are the pointers that the `%ir.*` memory operands
# in that function's machine code refer to.
4--- |
5  define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
6                                 <4 x i32> addrspace(1)* %global16,
7                                 i32* %flat4,
8                                 <4 x i32>* %flat16) {
9    ret void
10  }
11
12  define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() {
13    ret void
14  }
15
16  define amdgpu_kernel void @single_branch_successor_not_next_block() {
17    ret void
18  }
19
20  define amdgpu_kernel void @preexisting_waitcnt() {
21    ret void
22  }
23
24  define amdgpu_kernel void @bundle_no_waitcnt() {
25    ret void
26  }
27
28  define amdgpu_kernel void @preexisting_waitcnt_in_bundle() {
29    ret void
30  }
31
32  define amdgpu_kernel void @insert_in_bundle() {
33    ret void
34  }
35
36  define amdgpu_kernel void @exit_bundle() {
37    ret void
38  }
39
40  define amdgpu_kernel void @cross_bundle() {
41    ret void
42  }
43
44  define amdgpu_kernel void @subregs16bit() {
45    ret void
46  }
47...
48---
49
# Each block of this function issues FLAT loads whose memory operands differ:
#   bb.0 - both loads reference the addrspace(1) pointers (%ir.global4 and
#          %ir.global16)
#   bb.1 - the first load has no memory operand, the second references
#          %ir.global16
#   bb.2 - both loads reference the pointers declared without an address
#          space qualifier (%ir.flat4 and %ir.flat16)
#   bb.3 - one load references %ir.flat4, the other %ir.global4
#   bb.4 - a single load referencing %ir.flat4
# The plain `s_waitcnt ...` comment lines spell out the counters that the
# S_WAITCNT immediate expected on the following line stands for.
50# CHECK-LABEL: name: flat_zero_waitcnt
51
52# CHECK-LABEL: bb.0:
53# CHECK: FLAT_LOAD_DWORD
54# CHECK: FLAT_LOAD_DWORDX4
55# Global loads will return in order, so we should emit:
56# s_waitcnt vmcnt(1)
57# CHECK-NEXT: S_WAITCNT 3953
58
59# CHECK-LABEL: bb.1:
60# CHECK: FLAT_LOAD_DWORD
61# s_waitcnt vmcnt(0)
62# GFX89: S_WAITCNT 3952
63# CHECK: FLAT_LOAD_DWORDX4
64
65# CHECK-LABEL: bb.2:
66# CHECK: FLAT_LOAD_DWORD
67# s_waitcnt vmcnt(0)
68# GFX89: S_WAITCNT 3952
69# CHECK: FLAT_LOAD_DWORDX4
70
71# CHECK-LABEL: bb.3:
72# s_waitcnt vmcnt(0)
73# GFX89: S_WAITCNT 3952
74# CHECK: FLAT_LOAD_DWORD
75# CHECK: FLAT_LOAD_DWORD
76# s_waitcnt vmcnt(0) lgkmcnt(0)
77# GFX89: S_WAITCNT 112
78
79# CHECK-LABEL: bb.4:
80# GFX89-NOT: S_WAITCNT
81# CHECK: FLAT_LOAD_DWORD
82# s_waitcnt vmcnt(0) lgkmcnt(0)
83# GFX89: S_WAITCNT 112
84
85name: flat_zero_waitcnt
86
87body: |
88  bb.0:
89    successors: %bb.1
90    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4)
91    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
92    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
93    S_BRANCH %bb.1
94
95  bb.1:
96    successors: %bb.2
97    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
98    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
99    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
100    S_BRANCH %bb.2
101
102  bb.2:
103    successors: %bb.3
104    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
105    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.flat16)
106    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
107    S_BRANCH %bb.3
108
109  bb.3:
110    successors: %bb.4
111    $vgpr3 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
112    $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4)
113    $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec
114    S_BRANCH %bb.4
115
116  bb.4:
117    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
118    $vgpr0 = V_MOV_B32_e32 $vgpr5, implicit $exec
119    S_ENDPGM 0
120...
121---
122# bb.0 has only a single fallthrough successor block, so there is no
123# need to wait immediately at the end of bb.0.
# The wait is instead expected in bb.1, directly in front of the store that
# reads the loaded $vgpr0 (S_WAITCNT 112 is s_waitcnt vmcnt(0) lgkmcnt(0)).
124
125# CHECK-LABEL: name: single_fallthrough_successor_no_end_block_wait
126# CHECK:   $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2
127# CHECK-NOT: S_WAITCNT
128
129# CHECK: bb.1:
130# CHECK-NEXT: V_LSHLREV_B64
131# CHECK-NEXT: S_WAITCNT 112
132# CHECK-NEXT: FLAT_STORE_DWORD
133name: single_fallthrough_successor_no_end_block_wait
134
135body: |
136  bb.0:
137    successors: %bb.1
138    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
139
140  bb.1:
141    $vgpr3_vgpr4 = V_LSHLREV_B64 4, $vgpr7_vgpr8, implicit $exec
142    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
143    S_ENDPGM 0
144...
145---
146# bb.2 has a single predecessor (bb.0) which has a single successor, but bb.2
147# is not the next block, so it is non-obvious that no wait is needed at the
# end of bb.0.
# bb.1 sits between them in layout order but is not listed as a successor of
# bb.0; its store is expected to be followed immediately by S_ENDPGM, with no
# wait inserted. The wait for the load is expected in bb.2, directly in front
# of the store that reads $vgpr0 (S_WAITCNT 112 is s_waitcnt vmcnt(0)
# lgkmcnt(0)).
148
149
150# CHECK-LABEL: name: single_branch_successor_not_next_block
151
152# CHECK: bb.1
153# CHECK-NEXT: FLAT_STORE_DWORD
154# CHECK-NEXT: S_ENDPGM 0
155
156# CHECK: bb.2:
157# CHECK-NEXT: V_LSHLREV_B64
158# CHECK-NEXT: S_WAITCNT 112
159# CHECK-NEXT: FLAT_STORE_DWORD
160name: single_branch_successor_not_next_block
161
162body: |
163  bb.0:
164    successors: %bb.2
165    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
166   S_BRANCH %bb.2
167
168  bb.1:
169    FLAT_STORE_DWORD $vgpr8_vgpr9, $vgpr10, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
170    S_ENDPGM 0
171
172  bb.2:
173     $vgpr3_vgpr4 = V_LSHLREV_B64 4, $vgpr7_vgpr8, implicit $exec
174    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
175    S_ENDPGM 0
176...
177
# The input already contains an S_WAITCNT 0 between the load of $vgpr0 and
# the store that reads it. The existing wait is expected to stay directly
# after the load, and no further S_WAITCNT is expected in the function.
178# CHECK-LABEL: name: preexisting_waitcnt{{$}}
179# CHECK: FLAT_LOAD_DWORD
180# CHECK-NEXT: S_WAITCNT 0
181# CHECK-NOT: S_WAITCNT
182name: preexisting_waitcnt
183tracksRegLiveness: true
184machineFunctionInfo:
185  isEntryFunction: true
186body: |
187  bb.0:
188    liveins: $vgpr1_vgpr2
189    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
190    S_WAITCNT 0
191    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
192
193...
194
195---
196
# A bundle that contains only S_NOP instructions sits between the load of
# $vgpr0 and the store that reads it. The bundle is expected to be left as it
# is, with the wait (S_WAITCNT 112, i.e. s_waitcnt vmcnt(0) lgkmcnt(0))
# appearing right after the bundle's closing brace.
197# CHECK-LABEL: name: bundle_no_waitcnt{{$}}
198# CHECK: FLAT_LOAD_DWORD
199# CHECK-NEXT: BUNDLE
200# CHECK-NEXT: S_NOP
201# CHECK-NEXT: S_NOP
202# CHECK-NEXT: }
203# CHECK-NEXT: S_WAITCNT 112
204name: bundle_no_waitcnt
205tracksRegLiveness: true
206machineFunctionInfo:
207  isEntryFunction: true
208body: |
209  bb.0:
210    liveins: $vgpr1_vgpr2
211    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
212    BUNDLE {
213      S_NOP 0
214      S_NOP 0
215    }
216    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
217
218...
219
220---
221
222# The pass should see the S_WAITCNT 0 that is already inside the bundle and
# must not insert an extra one: after the existing wait, no further
# S_WAITCNT is expected in the function.
223# CHECK-LABEL: name: preexisting_waitcnt_in_bundle{{$}}
224# CHECK: FLAT_LOAD_DWORD
225# CHECK: S_WAITCNT 0
226# CHECK-NOT: S_WAITCNT
227name: preexisting_waitcnt_in_bundle
228tracksRegLiveness: true
229machineFunctionInfo:
230  isEntryFunction: true
231body: |
232  bb.0:
233    liveins: $vgpr1_vgpr2
234    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
235    BUNDLE {
236      S_NOP 0
237      S_WAITCNT 0
238    }
239    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
240
241...
242
243---
244
245# Def and use are both inside the same bundle: the load that defines $vgpr0
# and the store that reads it (as an `internal` operand). The wait
# (S_WAITCNT 112, i.e. s_waitcnt vmcnt(0) lgkmcnt(0)) is expected inside the
# bundle, between the two instructions.
246# CHECK-LABEL: name: insert_in_bundle{{$}}
247# CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
248# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
249# CHECK-NEXT: S_WAITCNT 112
250# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
251# CHECK-NEXT: }
252
253name: insert_in_bundle
254tracksRegLiveness: true
255machineFunctionInfo:
256  isEntryFunction: true
257body: |
258  bb.0:
259    liveins: $vgpr1_vgpr2
260    BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
261    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
262    FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
263    }
264...
265
266---
267
268# The def of $vgpr0 is the last (and only) instruction in the bundle; the use
# is the store outside the bundle. The wait (S_WAITCNT 112, i.e. s_waitcnt
# vmcnt(0) lgkmcnt(0)) is expected after the bundle's closing brace, directly
# in front of the store.
269
270# CHECK-LABEL: name: exit_bundle{{$}}
271# CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
272# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
273# CHECK-NEXT: }
274# CHECK-NEXT: S_WAITCNT 112
275# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
276
277name: exit_bundle
278tracksRegLiveness: true
279machineFunctionInfo:
280  isEntryFunction: true
281body: |
282  bb.0:
283    liveins: $vgpr1_vgpr2
284    BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
285    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
286    }
287
288    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
289
290...
291
292---
293
294# The def of $vgpr0 is in one bundle and its use is in another bundle. The
# wait (S_WAITCNT 112, i.e. s_waitcnt vmcnt(0) lgkmcnt(0)) is expected
# between the two bundles, with both bundles otherwise unchanged.
295
296# CHECK-LABEL: name: cross_bundle{{$}}
297# CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
298# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
299# CHECK-NEXT: }
300# CHECK-NEXT: S_WAITCNT 112
301# CHECK-NEXT: BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 {
302# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
303# CHECK-NEXT: }
304
305name: cross_bundle
306tracksRegLiveness: true
307machineFunctionInfo:
308  isEntryFunction: true
309body: |
310  bb.0:
311    liveins: $vgpr1_vgpr2
312    BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
313    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
314    }
315    BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 {
316      FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
317    }
318...
319
320---
# Two FLAT_LOAD_USHORT instructions write $vgpr0 and $vgpr1; the V_NOP_e32
# that follows only reads them through the implicit operands $vgpr0_lo16 and
# $vgpr1_lo16. The wait (S_WAITCNT 112, i.e. s_waitcnt vmcnt(0) lgkmcnt(0))
# is expected directly in front of the V_NOP_e32.
321# CHECK-LABEL: name: subregs16bit
322# CHECK: S_WAITCNT 112
323# CHECK-NEXT: V_NOP_e32
324
325name: subregs16bit
326machineFunctionInfo:
327  isEntryFunction: true
328body: |
329  bb.0:
330    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4
331      $vgpr0 = FLAT_LOAD_USHORT killed $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
332      $vgpr1 = FLAT_LOAD_USHORT killed $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
333      V_NOP_e32 implicit $exec, implicit $vgpr0_lo16, implicit $vgpr1_lo16
334...
335