1; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs -disable-block-placement < %s | FileCheck -allow-deprecated-dag-overlap %s
2
3;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4;; This test checks that x86-cmov-converter optimization transform CMOV
5;; instruction into branches when it is profitable.
6;; There are 5 cases below:
7;;   1. CmovInCriticalPath:
8;;        CMOV depends on the condition and it is in the hot path.
9;;        Thus, it worths transforming.
10;;
11;;   2. CmovNotInCriticalPath:
12;;        Similar test like in (1), just that CMOV is not in the hot path.
13;;        Thus, it does not worth transforming.
14;;
15;;   3. MaxIndex:
16;;        Maximum calculation algorithm that is looking for the max index,
17;;        calculating CMOV value is cheaper than calculating CMOV condition.
18;;        Thus, it worths transforming.
19;;
20;;   4. MaxValue:
21;;        Maximum calculation algorithm that is looking for the max value,
22;;        calculating CMOV value is not cheaper than calculating CMOV condition.
23;;        Thus, it does not worth transforming.
24;;
25;;   5. BinarySearch:
26;;        Usually, binary search CMOV is not predicted.
27;;        Thus, it does not worth transforming.
28;;
29;; Test was created using the following command line:
30;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
31;; Where foo.c is:
32;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
33;;void CmovInHotPath(int n, int a, int b, int *c, int *d) {
34;;  for (int i = 0; i < n; i++) {
35;;    int t = c[i] + 1;
36;;    if (c[i] * a > b)
37;;      t = 10;
38;;    c[i] = (c[i] + 1) * t;
39;;  }
40;;}
41;;
42;;
43;;void CmovNotInHotPath(int n, int a, int b, int *c, int *d) {
44;;  for (int i = 0; i < n; i++) {
45;;    int t = c[i];
46;;    if (c[i] * a > b)
47;;      t = 10;
48;;    c[i] = t;
49;;    d[i] /= b;
50;;  }
51;;}
52;;
53;;
54;;int MaxIndex(int n, int *a) {
55;;  int t = 0;
56;;  for (int i = 1; i < n; i++) {
57;;    if (a[i] > a[t])
58;;      t = i;
59;;  }
60;;  return t;
61;;}
62;;
63;;
64;;int MaxValue(int n, int *a) {
65;;  int t = a[0];
66;;  for (int i = 1; i < n; i++) {
67;;    if (a[i] > t)
68;;      t = a[i];
69;;  }
70;;  return t;
71;;}
72;;
73;;typedef struct Node Node;
74;;struct Node {
75;;  unsigned Val;
76;;  Node *Right;
77;;  Node *Left;
78;;};
79;;
80;;unsigned BinarySearch(unsigned Mask, Node *Curr, Node *Next) {
81;;  while (Curr->Val > Next->Val) {
82;;    Curr = Next;
83;;    if (Mask & (0x1 << Curr->Val))
84;;      Next = Curr->Right;
85;;    else
86;;      Next = Curr->Left;
87;;  }
88;;  return Curr->Val;
89;;}
90;;
91;;
92;;void SmallGainPerLoop(int n, int a, int b, int *c, int *d) {
93;;  for (int i = 0; i < n; i++) {
94;;    int t = c[i];
95;;    if (c[i] * a > b)
96;;      t = 10;
97;;    c[i] = t;
98;;  }
99;;}
100;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
101
102%struct.Node = type { i32, %struct.Node*, %struct.Node* }
103
104; CHECK-LABEL: CmovInHotPath
105; CHECK-NOT: cmov
106; CHECK: jg
107
108define void @CmovInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 {
109entry:
110  %cmp14 = icmp sgt i32 %n, 0
111  br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
112
113for.body.preheader:                               ; preds = %entry
114  %wide.trip.count = zext i32 %n to i64
115  br label %for.body
116
117for.cond.cleanup:                                 ; preds = %for.body, %entry
118  ret void
119
120for.body:                                         ; preds = %for.body.preheader, %for.body
121  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
122  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
123  %0 = load i32, i32* %arrayidx, align 4
124  %add = add nsw i32 %0, 1
125  %mul = mul nsw i32 %0, %a
126  %cmp3 = icmp sgt i32 %mul, %b
127  %. = select i1 %cmp3, i32 10, i32 %add
128  %mul7 = mul nsw i32 %., %add
129  store i32 %mul7, i32* %arrayidx, align 4
130  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
131  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
132  br i1 %exitcond, label %for.cond.cleanup, label %for.body
133}
134
135; CHECK-LABEL: CmovNotInHotPath
136; CHECK: cmovg
137
138define void @CmovNotInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture %d) #0 {
139entry:
140  %cmp18 = icmp sgt i32 %n, 0
141  br i1 %cmp18, label %for.body.preheader, label %for.cond.cleanup
142
143for.body.preheader:                               ; preds = %entry
144  %wide.trip.count = zext i32 %n to i64
145  br label %for.body
146
147for.cond.cleanup:                                 ; preds = %for.body, %entry
148  ret void
149
150for.body:                                         ; preds = %for.body.preheader, %for.body
151  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
152  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
153  %0 = load i32, i32* %arrayidx, align 4
154  %mul = mul nsw i32 %0, %a
155  %cmp3 = icmp sgt i32 %mul, %b
156  %. = select i1 %cmp3, i32 10, i32 %0
157  store i32 %., i32* %arrayidx, align 4
158  %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
159  %1 = load i32, i32* %arrayidx7, align 4
160  %div = sdiv i32 %1, %b
161  store i32 %div, i32* %arrayidx7, align 4
162  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
163  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
164  br i1 %exitcond, label %for.cond.cleanup, label %for.body
165}
166
167; CHECK-LABEL: MaxIndex
168; CHECK-NOT: cmov
169; CHECK: jg
170
171define i32 @MaxIndex(i32 %n, i32* nocapture readonly %a) #0 {
172entry:
173  %cmp14 = icmp sgt i32 %n, 1
174  br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
175
176for.body.preheader:                               ; preds = %entry
177  %wide.trip.count = zext i32 %n to i64
178  br label %for.body
179
180for.cond.cleanup:                                 ; preds = %for.body, %entry
181  %t.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.t.0, %for.body ]
182  ret i32 %t.0.lcssa
183
184for.body:                                         ; preds = %for.body.preheader, %for.body
185  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ]
186  %t.015 = phi i32 [ %i.0.t.0, %for.body ], [ 0, %for.body.preheader ]
187  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
188  %0 = load i32, i32* %arrayidx, align 4
189  %idxprom1 = sext i32 %t.015 to i64
190  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %idxprom1
191  %1 = load i32, i32* %arrayidx2, align 4
192  %cmp3 = icmp sgt i32 %0, %1
193  %2 = trunc i64 %indvars.iv to i32
194  %i.0.t.0 = select i1 %cmp3, i32 %2, i32 %t.015
195  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
196  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
197  br i1 %exitcond, label %for.cond.cleanup, label %for.body
198}
199
200; CHECK-LABEL: MaxValue
201; CHECK-NOT: jg
202; CHECK: cmovg
203
204define i32 @MaxValue(i32 %n, i32* nocapture readonly %a) #0 {
205entry:
206  %0 = load i32, i32* %a, align 4
207  %cmp13 = icmp sgt i32 %n, 1
208  br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup
209
210for.body.preheader:                               ; preds = %entry
211  %wide.trip.count = zext i32 %n to i64
212  br label %for.body
213
214for.cond.cleanup:                                 ; preds = %for.body, %entry
215  %t.0.lcssa = phi i32 [ %0, %entry ], [ %.t.0, %for.body ]
216  ret i32 %t.0.lcssa
217
218for.body:                                         ; preds = %for.body.preheader, %for.body
219  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ]
220  %t.014 = phi i32 [ %.t.0, %for.body ], [ %0, %for.body.preheader ]
221  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
222  %1 = load i32, i32* %arrayidx1, align 4
223  %cmp2 = icmp sgt i32 %1, %t.014
224  %.t.0 = select i1 %cmp2, i32 %1, i32 %t.014
225  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
226  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
227  br i1 %exitcond, label %for.cond.cleanup, label %for.body
228}
229
230; CHECK-LABEL: BinarySearch
231; CHECK: set
232
233define i32 @BinarySearch(i32 %Mask, %struct.Node* nocapture readonly %Curr, %struct.Node* nocapture readonly %Next) #0 {
234entry:
235  %Val8 = getelementptr inbounds %struct.Node, %struct.Node* %Curr, i64 0, i32 0
236  %0 = load i32, i32* %Val8, align 8
237  %Val19 = getelementptr inbounds %struct.Node, %struct.Node* %Next, i64 0, i32 0
238  %1 = load i32, i32* %Val19, align 8
239  %cmp10 = icmp ugt i32 %0, %1
240  br i1 %cmp10, label %while.body, label %while.end
241
242while.body:                                       ; preds = %entry, %while.body
243  %2 = phi i32 [ %4, %while.body ], [ %1, %entry ]
244  %Next.addr.011 = phi %struct.Node* [ %3, %while.body ], [ %Next, %entry ]
245  %shl = shl i32 1, %2
246  %and = and i32 %shl, %Mask
247  %tobool = icmp eq i32 %and, 0
248  %Left = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 2
249  %Right = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 1
250  %Left.sink = select i1 %tobool, %struct.Node** %Left, %struct.Node** %Right
251  %3 = load %struct.Node*, %struct.Node** %Left.sink, align 8
252  %Val1 = getelementptr inbounds %struct.Node, %struct.Node* %3, i64 0, i32 0
253  %4 = load i32, i32* %Val1, align 8
254  %cmp = icmp ugt i32 %2, %4
255  br i1 %cmp, label %while.body, label %while.end
256
257while.end:                                        ; preds = %while.body, %entry
258  %.lcssa = phi i32 [ %0, %entry ], [ %2, %while.body ]
259  ret i32 %.lcssa
260}
261
262;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
263;; The following test checks that x86-cmov-converter optimization transforms
264;; CMOV instructions into branch correctly.
265;;
266;; MBB:
267;;   cond = cmp ...
268;;   v1 = CMOVgt t1, f1, cond
269;;   v2 = CMOVle s1, f2, cond
270;;
271;; Where: t1 = 11, f1 = 22, f2 = a
272;;
273;; After CMOV transformation
274;; -------------------------
275;; MBB:
276;;   cond = cmp ...
277;;   ja %SinkMBB
278;;
279;; FalseMBB:
280;;   jmp %SinkMBB
281;;
282;; SinkMBB:
283;;   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
284;;   %v2 = phi[%f1, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
285;;                                          ; true-value with false-value
286;;                                          ; Phi instruction cannot use
287;;                                          ; previous Phi instruction result
288;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
289
290; CHECK-LABEL: Transform
291; CHECK-NOT: cmov
292; CHECK:         divl    [[a:%[0-9a-z]*]]
293; CHECK:         movl    $11, [[s1:%[0-9a-z]*]]
294; CHECK:         movl    [[a]], [[s2:%[0-9a-z]*]]
295; CHECK:         cmpl    [[a]], %edx
296; CHECK:         ja      [[SinkBB:.*]]
297; CHECK: [[FalseBB:.*]]:
298; CHECK:         movl    $22, [[s1]]
299; CHECK:         movl    $22, [[s2]]
300; CHECK: [[SinkBB]]:
301; CHECK:         ja
302
303define void @Transform(i32 *%arr, i32 *%arr2, i32 %a, i32 %b, i32 %c, i32 %n) #0 {
304entry:
305  %cmp10 = icmp ugt i32 0, %n
306  br i1 %cmp10, label %while.body, label %while.end
307
308while.body:                                       ; preds = %entry, %while.body
309  %i = phi i32 [ %i_inc, %while.body ], [ 0, %entry ]
310  %arr_i = getelementptr inbounds i32, i32* %arr, i32 %i
311  %x = load i32, i32* %arr_i, align 4
312  %div = udiv i32 %x, %a
313  %cond = icmp ugt i32 %div, %a
314  %condOpp = icmp ule i32 %div, %a
315  %s1 = select i1 %cond, i32 11, i32 22
316  %s2 = select i1 %condOpp, i32 %s1, i32 %a
317  %sum = urem i32 %s1, %s2
318  store i32 %sum, i32* %arr_i, align 4
319  %i_inc = add i32 %i, 1
320  %cmp = icmp ugt i32 %i_inc, %n
321  br i1 %cmp, label %while.body, label %while.end
322
323while.end:                                        ; preds = %while.body, %entry
324  ret void
325}
326
327; Test that we always will convert a cmov with a memory operand into a branch,
328; even outside of a loop.
329define i32 @test_cmov_memoperand(i32 %a, i32 %b, i32 %x, i32* %y) #0 {
330; CHECK-LABEL: test_cmov_memoperand:
331entry:
332  %cond = icmp ugt i32 %a, %b
333; CHECK:         movl %edx, %eax
334; CHECK:         cmpl
335  %load = load i32, i32* %y
336  %z = select i1 %cond, i32 %x, i32 %load
337; CHECK-NOT:     cmov
338; CHECK:         ja [[FALSE_BB:.*]]
339; CHECK:         movl (%rcx), %eax
340; CHECK:       [[FALSE_BB]]:
341  ret i32 %z
342}
343
344; Test that we can convert a group of cmovs where only one has a memory
345; operand.
346define i32 @test_cmov_memoperand_in_group(i32 %a, i32 %b, i32 %x, i32* %y.ptr) #0 {
347; CHECK-LABEL: test_cmov_memoperand_in_group:
348entry:
349  %cond = icmp ugt i32 %a, %b
350; CHECK:         movl %edx, %eax
351; CHECK:         cmpl
352  %y = load i32, i32* %y.ptr
353  %z1 = select i1 %cond, i32 %x, i32 %a
354  %z2 = select i1 %cond, i32 %x, i32 %y
355  %z3 = select i1 %cond, i32 %x, i32 %b
356; CHECK-NOT:     cmov
357; CHECK:         ja [[FALSE_BB:.*]]
358; CHECK-DAG:     movl %{{.*}}, %[[R1:.*]]
359; CHECK-DAG:     movl (%r{{..}}), %[[R2:.*]]
360; CHECK-DAG:     movl %{{.*}} %eax
361; CHECK:       [[FALSE_BB]]:
362; CHECK:         addl
363; CHECK-DAG:       %[[R1]]
364; CHECK-DAG:       ,
365; CHECK-DAG:       %eax
366; CHECK-DAG:     addl
367; CHECK-DAG:       %[[R2]]
368; CHECK-DAG:       ,
369; CHECK-DAG:       %eax
370; CHECK:         retq
371  %s1 = add i32 %z1, %z2
372  %s2 = add i32 %s1, %z3
373  ret i32 %s2
374}
375
376; Same as before but with operands reversed in the select with a load.
377define i32 @test_cmov_memoperand_in_group2(i32 %a, i32 %b, i32 %x, i32* %y.ptr) #0 {
378; CHECK-LABEL: test_cmov_memoperand_in_group2:
379entry:
380  %cond = icmp ugt i32 %a, %b
381; CHECK:         movl %edx, %eax
382; CHECK:         cmpl
383  %y = load i32, i32* %y.ptr
384  %z2 = select i1 %cond, i32 %a, i32 %x
385  %z1 = select i1 %cond, i32 %y, i32 %x
386  %z3 = select i1 %cond, i32 %b, i32 %x
387; CHECK-NOT:     cmov
388; CHECK:         jbe [[FALSE_BB:.*]]
389; CHECK-DAG:     movl %{{.*}}, %[[R1:.*]]
390; CHECK-DAG:     movl (%r{{..}}), %[[R2:.*]]
391; CHECK-DAG:     movl %{{.*}} %eax
392; CHECK:       [[FALSE_BB]]:
393; CHECK:         addl
394; CHECK-DAG:       %[[R1]]
395; CHECK-DAG:       ,
396; CHECK-DAG:       %eax
397; CHECK-DAG:     addl
398; CHECK-DAG:       %[[R2]]
399; CHECK-DAG:       ,
400; CHECK-DAG:       %eax
401; CHECK:         retq
402  %s1 = add i32 %z1, %z2
403  %s2 = add i32 %s1, %z3
404  ret i32 %s2
405}
406
407; Test that we don't convert a group of cmovs with conflicting directions of
408; loads.
409define i32 @test_cmov_memoperand_conflicting_dir(i32 %a, i32 %b, i32 %x, i32* %y1.ptr, i32* %y2.ptr) #0 {
410; CHECK-LABEL: test_cmov_memoperand_conflicting_dir:
411entry:
412  %cond = icmp ugt i32 %a, %b
413; CHECK:         cmpl
414  %y1 = load i32, i32* %y1.ptr
415  %y2 = load i32, i32* %y2.ptr
416  %z1 = select i1 %cond, i32 %x, i32 %y1
417  %z2 = select i1 %cond, i32 %y2, i32 %x
418; CHECK:         cmoval
419; CHECK:         cmoval
420  %s1 = add i32 %z1, %z2
421  ret i32 %s1
422}
423
424; Test that we can convert a group of cmovs where only one has a memory
425; operand and where that memory operand's registers come from a prior cmov in
426; the group.
427define i32 @test_cmov_memoperand_in_group_reuse_for_addr(i32 %a, i32 %b, i32* %x, i32* %y) #0 {
428; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr:
429entry:
430  %cond = icmp ugt i32 %a, %b
431; CHECK:         movl %edi, %eax
432; CHECK:         cmpl
433  %p = select i1 %cond, i32* %x, i32* %y
434  %load = load i32, i32* %p
435  %z = select i1 %cond, i32 %a, i32 %load
436; CHECK-NOT:     cmov
437; CHECK:         ja [[FALSE_BB:.*]]
438; CHECK:         movl (%r{{..}}), %eax
439; CHECK:       [[FALSE_BB]]:
440; CHECK:         retq
441  ret i32 %z
442}
443
444; Test that we can convert a group of two cmovs with memory operands where one
445; uses the result of the other as part of the address.
446define i32 @test_cmov_memoperand_in_group_reuse_for_addr2(i32 %a, i32 %b, i32* %x, i32** %y) #0 {
447; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr2:
448entry:
449  %cond = icmp ugt i32 %a, %b
450; CHECK:         movl %edi, %eax
451; CHECK:         cmpl
452  %load1 = load i32*, i32** %y
453  %p = select i1 %cond, i32* %x, i32* %load1
454  %load2 = load i32, i32* %p
455  %z = select i1 %cond, i32 %a, i32 %load2
456; CHECK-NOT:     cmov
457; CHECK:         ja [[FALSE_BB:.*]]
458; CHECK:         movq (%r{{..}}), %[[R1:.*]]
459; CHECK:         movl (%[[R1]]), %eax
460; CHECK:       [[FALSE_BB]]:
461; CHECK:         retq
462  ret i32 %z
463}
464
465; Test that we can convert a group of cmovs where only one has a memory
466; operand and where that memory operand's registers come from a prior cmov and
467; where that cmov gets *its* input from a prior cmov in the group.
468define i32 @test_cmov_memoperand_in_group_reuse_for_addr3(i32 %a, i32 %b, i32* %x, i32* %y, i32* %z) #0 {
469; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr3:
470entry:
471  %cond = icmp ugt i32 %a, %b
472; CHECK:         movl %edi, %eax
473; CHECK:         cmpl
474  %p = select i1 %cond, i32* %x, i32* %y
475  %p2 = select i1 %cond, i32* %z, i32* %p
476  %load = load i32, i32* %p2
477  %r = select i1 %cond, i32 %a, i32 %load
478; CHECK-NOT:     cmov
479; CHECK:         ja [[FALSE_BB:.*]]
480; CHECK:         movl (%r{{..}}), %eax
481; CHECK:       [[FALSE_BB]]:
482; CHECK:         retq
483  ret i32 %r
484}
485
486attributes #0 = {"target-cpu"="x86-64"}
487