# Runs only the si-insert-waitcnts pass over the MIR in this file, once for
# gfx803 and once for gfx900, and verifies the printed MIR with FileCheck.
# Both the generic CHECK prefix and the GFX89 prefix must be enabled: FileCheck
# silently ignores directives whose prefix is not listed on its command line.
# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefixes=CHECK,GFX89 %s
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefixes=CHECK,GFX89 %s
3
--- |
  ; Stub IR module. The function bodies are empty; the module only provides
  ; the function names used by the MIR documents that follow and the pointer
  ; arguments that their memory operands refer to as %ir.<name>.

  ; %global4 and %global16 are addrspace(1) pointers, while %flat4 and
  ; %flat16 carry no address-space qualifier.
  define amdgpu_kernel void @flat_zero_waitcnt(
      i32 addrspace(1)* %global4,
      <4 x i32> addrspace(1)* %global16,
      i32* %flat4,
      <4 x i32>* %flat16) {
    ret void
  }

  define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() {
    ret void
  }

  define amdgpu_kernel void @single_branch_successor_not_next_block() {
    ret void
  }

...
---
# Every block below issues a FLAT_LOAD_DWORD into $vgpr0 followed by a
# FLAT_LOAD_DWORDX4 into $vgpr3..$vgpr6, and then overwrites $vgpr0 with a
# V_MOV. The blocks differ only in the memory operands attached to the loads.

# CHECK-LABEL: name: flat_zero_waitcnt

# bb.0 - both loads name the addrspace(1) arguments.
# CHECK-LABEL: bb.0:
# CHECK: FLAT_LOAD_DWORD
# CHECK: FLAT_LOAD_DWORDX4
# Global loads return in order, so the expected wait directly after the
# second load is:
# s_waitcnt vmcnt(1) lgkmcnt(1)
# CHECK-NEXT: S_WAITCNT 369

# bb.1 - the first load has no memory operand at all. A wait with immediate
# 112 is expected somewhere between the two loads.
# NOTE(review): 112 presumably encodes vmcnt(0) lgkmcnt(0) on these targets -
# confirm against the S_WAITCNT immediate layout.
# CHECK-LABEL: bb.1:
# CHECK: FLAT_LOAD_DWORD
# GFX89: S_WAITCNT 112
# CHECK: FLAT_LOAD_DWORDX4

# bb.2 - both loads name the unqualified (flat) pointer arguments. The same
# wait as in bb.1 is expected between the two loads.
# CHECK-LABEL: bb.2:
# CHECK: FLAT_LOAD_DWORD
# GFX89: S_WAITCNT 112
# CHECK: FLAT_LOAD_DWORDX4

name: flat_zero_waitcnt
body: |
  bb.0:
    successors: %bb.1

    ; Loads tagged with %ir.global4 / %ir.global16.
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4)
    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
    ; Redefines the destination of the first load.
    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.2

    ; The first load deliberately carries no memory operand.
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr
    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
    S_BRANCH %bb.2

  bb.2:
    ; Loads tagged with %ir.flat4 / %ir.flat16.
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.flat16)
    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
    S_ENDPGM
...
---
# bb.0 falls through into bb.1, its only successor, so no wait has to be
# emitted at the end of bb.0. The wait for the outstanding load is expected
# in bb.1 instead, immediately before the store that consumes $vgpr0.

# CHECK-LABEL: name: single_fallthrough_successor_no_end_block_wait
# CHECK:   $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2
# CHECK-NOT: S_WAITCNT

# CHECK: bb.1:
# CHECK-NEXT: V_LSHLREV_B64
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: FLAT_STORE_DWORD

name: single_fallthrough_successor_no_end_block_wait
body: |
  bb.0:
    successors: %bb.1

    ; Load whose result is only read in the successor block.
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr

  bb.1:
    ; The shift does not touch $vgpr0; the store is the first reader of it.
    $vgpr3_vgpr4 = V_LSHLREV_B64 4, $vgpr7_vgpr8, implicit $exec
    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, implicit $exec, implicit $flat_scr
    S_ENDPGM
...
---
# bb.0 has a single successor, bb.2, and bb.2 has bb.0 as its single
# predecessor, but bb.2 is reached by an explicit branch over bb.1 rather than
# by fallthrough. It is therefore less obvious that no wait is needed at the
# end of bb.0. The wait is expected in bb.2, right before the store that reads
# $vgpr0, and bb.1 is expected to start with its store without any wait.

# CHECK-LABEL: name: single_branch_successor_not_next_block

# CHECK: bb.1
# CHECK-NEXT: FLAT_STORE_DWORD
# CHECK-NEXT: S_ENDPGM

# CHECK: bb.2:
# CHECK-NEXT: V_LSHLREV_B64
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: FLAT_STORE_DWORD

name: single_branch_successor_not_next_block
body: |
  bb.0:
    successors: %bb.2

    ; Load whose result is only read in bb.2.
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr
    S_BRANCH %bb.2

  bb.1:
    ; Not a successor of bb.0; uses registers unrelated to the load above.
    FLAT_STORE_DWORD $vgpr8_vgpr9, $vgpr10, 0, 0, 0, implicit $exec, implicit $flat_scr
    S_ENDPGM

  bb.2:
    ; The store is the first reader of the loaded $vgpr0.
    $vgpr3_vgpr4 = V_LSHLREV_B64 4, $vgpr7_vgpr8, implicit $exec
    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, implicit $exec, implicit $flat_scr
    S_ENDPGM
...
121