1; RUN: opt %s -analyze -divergence | FileCheck %s
2
3target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
4target triple = "nvptx64-nvidia-cuda"
5
6; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
7define i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
8; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'no_diverge'
9entry:
10  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
11  %cond = icmp slt i32 %n, 0
12  br i1 %cond, label %then, label %else ; uniform
13; CHECK-NOT: DIVERGENT: br i1 %cond,
14then:
15  %a1 = add i32 %a, %tid
16  br label %merge
17else:
18  %b2 = add i32 %b, %tid
19  br label %merge
20merge:
21  %c = phi i32 [ %a1, %then ], [ %b2, %else ]
22  ret i32 %c
23}
24
25; c = a;
26; if (threadIdx.x < 5)    // divergent: data dependent
27;   c = b;
28; return c;               // c is divergent: sync dependent
29define i32 @sync(i32 %a, i32 %b) {
30; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync'
31bb1:
32  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
33  %cond = icmp slt i32 %tid, 5
34  br i1 %cond, label %bb2, label %bb3
35; CHECK: DIVERGENT: br i1 %cond,
36bb2:
37  br label %bb3
38bb3:
39  %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid
40; CHECK: DIVERGENT: %c =
41  ret i32 %c
42}
43
44; c = 0;
45; if (threadIdx.x >= 5) {  // divergent
46;   c = (n < 0 ? a : b);  // c here is uniform because n is uniform
47; }
48; // c here is divergent because it is sync dependent on threadIdx.x >= 5
49; return c;
50define i32 @mixed(i32 %n, i32 %a, i32 %b) {
51; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'mixed'
52bb1:
53  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
54  %cond = icmp slt i32 %tid, 5
55  br i1 %cond, label %bb6, label %bb2
56; CHECK: DIVERGENT: br i1 %cond,
57bb2:
58  %cond2 = icmp slt i32 %n, 0
59  br i1 %cond2, label %bb4, label %bb3
60bb3:
61  br label %bb5
62bb4:
63  br label %bb5
64bb5:
65  %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ]
66; CHECK-NOT: DIVERGENT: %c =
67  br label %bb6
68bb6:
69  %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ]
70; CHECK: DIVERGENT: %c2 =
71  ret i32 %c2
72}
73
74; We conservatively treats all parameters of a __device__ function as divergent.
75define i32 @device(i32 %n, i32 %a, i32 %b) {
76; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'device'
77; CHECK: DIVERGENT: i32 %n
78; CHECK: DIVERGENT: i32 %a
79; CHECK: DIVERGENT: i32 %b
80entry:
81  %cond = icmp slt i32 %n, 0
82  br i1 %cond, label %then, label %else
83; CHECK: DIVERGENT: br i1 %cond,
84then:
85  br label %merge
86else:
87  br label %merge
88merge:
89  %c = phi i32 [ %a, %then ], [ %b, %else ]
90  ret i32 %c
91}
92
93; int i = 0;
94; do {
95;   i++;                  // i here is uniform
96; } while (i < laneid);
97; return i == 10 ? 0 : 1; // i here is divergent
98;
99; The i defined in the loop is used outside.
100define i32 @loop() {
101; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'loop'
102entry:
103  %laneid = call i32 @llvm.ptx.read.laneid()
104  br label %loop
105loop:
106  %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
107; CHECK-NOT: DIVERGENT: %i =
108  %i1 = add i32 %i, 1
109  %exit_cond = icmp sge i32 %i1, %laneid
110  br i1 %exit_cond, label %loop_exit, label %loop
111loop_exit:
112  %cond = icmp eq i32 %i, 10
113  br i1 %cond, label %then, label %else
114; CHECK: DIVERGENT: br i1 %cond,
115then:
116  ret i32 0
117else:
118  ret i32 1
119}
120
121; Same as @loop, but the loop is in the LCSSA form.
122define i32 @lcssa() {
123; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'lcssa'
124entry:
125  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
126  br label %loop
127loop:
128  %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
129; CHECK-NOT: DIVERGENT: %i =
130  %i1 = add i32 %i, 1
131  %exit_cond = icmp sge i32 %i1, %tid
132  br i1 %exit_cond, label %loop_exit, label %loop
133loop_exit:
134  %i.lcssa = phi i32 [ %i, %loop ]
135; CHECK: DIVERGENT: %i.lcssa =
136  %cond = icmp eq i32 %i.lcssa, 10
137  br i1 %cond, label %then, label %else
138; CHECK: DIVERGENT: br i1 %cond,
139then:
140  ret i32 0
141else:
142  ret i32 1
143}
144
145; This test contains an unstructured loop.
146;           +-------------- entry ----------------+
147;           |                                     |
148;           V                                     V
149; i1 = phi(0, i3)                            i2 = phi(0, i3)
150;     j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2
151;           ^                 |                   ^
152;           |                 V                   |
153;           +-------- switch (tid / i3) ----------+
154;                             |
155;                             V
156;                        if (i3 == 5) // divergent
157; because sync dependent on (tid / i3).
158define i32 @unstructured_loop(i1 %entry_cond) {
159; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'unstructured_loop'
160entry:
161  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
162  br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2
163loop_entry_1:
164  %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
165  %j1 = add i32 %i1, 1
166  br label %loop_body
167loop_entry_2:
168  %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
169  %j2 = add i32 %i2, 2
170  br label %loop_body
171loop_body:
172  %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ]
173  br label %loop_latch
174loop_latch:
175  %div = sdiv i32 %tid, %i3
176  switch i32 %div, label %branch [ i32 1, label %loop_entry_1
177                                   i32 2, label %loop_entry_2 ]
178branch:
179  %cmp = icmp eq i32 %i3, 5
180  br i1 %cmp, label %then, label %else
181; CHECK: DIVERGENT: br i1 %cmp,
182then:
183  ret i32 0
184else:
185  ret i32 1
186}
187
188; Verifies sync-dependence is computed correctly in the absense of loops.
189define i32 @sync_no_loop(i32 %arg) {
190entry:
191  %0 = add i32 %arg, 1
192  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
193  %1 = icmp sge i32 %tid, 10
194  br i1 %1, label %bb1, label %bb2
195
196bb1:
197  br label %bb3
198
199bb2:
200  br label %bb3
201
202bb3:
203  %2 = add i32 %0, 2
204  ; CHECK-NOT: DIVERGENT: %2
205  ret i32 %2
206}
207
208declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
209declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
210declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
211declare i32 @llvm.ptx.read.laneid()
212
213!nvvm.annotations = !{!0, !1, !2, !3, !4, !5}
214!0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1}
215!1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1}
216!2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1}
217!3 = !{i32 ()* @loop, !"kernel", i32 1}
218!4 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1}
219!5 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1}
220