1; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
2
; Data layout with 32-bit pointers (p:32:32:32), used with the -march=nvptx
; RUN line at the top of this file.
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

; Count-leading-zeros intrinsics exercised below, one per tested integer width.
; Each takes the value to scan plus an i1 flag operand.
declare i64 @llvm.ctlz.i64(i64, i1) readnone
declare i32 @llvm.ctlz.i32(i32, i1) readnone
declare i16 @llvm.ctlz.i16(i16, i1) readnone
8
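; There should be no difference between llvm.ctlz.i32(%a, true) and
; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return the operand's
; bit width (32 for clz.b32), which is the result LLVM requires for a zero
; input when the second argument is false.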
11
; CHECK-LABEL: myctlz(
define i32 @myctlz(i32 %a) {
; i32 count with the flag operand set to false.  The checks expect exactly one
; clz.b32 between the parameter load and the return-value store.
; CHECK: ld.param.
; CHECK-NEXT: clz.b32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %lz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
  ret i32 %lz
}
; CHECK-LABEL: myctlz_2(
define i32 @myctlz_2(i32 %a) {
; i32 count with the flag operand set to true.  The expected instruction
; sequence is the same as for the false form: load, clz.b32, store, ret.
; CHECK: ld.param.
; CHECK-NEXT: clz.b32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %lz = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
  ret i32 %lz
}
30
31; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
32; value, so here we have to zero-extend it.
; CHECK-LABEL: myctlz64(
define i64 @myctlz64(i64 %a) {
; i64 count with the flag operand set to false.  The checks expect clz.b64
; followed by a cvt.u64.u32 that widens the count to the i64 return type.
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: cvt.u64.u32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %lz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
  ret i64 %lz
}
; CHECK-LABEL: myctlz64_2(
define i64 @myctlz64_2(i64 %a) {
; i64 count with the flag operand set to true.  The expected sequence is
; unchanged from the false form: clz.b64 and then a widening cvt.u64.u32.
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: cvt.u64.u32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %lz = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
  ret i64 %lz
}
53
54; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
55; natural return width of ptx's clz.b64 instruction.  No conversions should be
56; necessary in the PTX.
; CHECK-LABEL: myctlz64_as_32(
define i32 @myctlz64_as_32(i64 %a) {
; The i64 count (flag operand false) is truncated to i32 before being
; returned, so the checks expect clz.b64 with no cvt between it and the store.
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %lz64 = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
  %lz32 = trunc i64 %lz64 to i32
  ret i32 %lz32
}
; CHECK-LABEL: myctlz64_as_32_2(
define i32 @myctlz64_as_32_2(i64 %a) {
; Counterpart of myctlz64_as_32 with the flag operand set to true, matching
; the other *_2 variants in this file.  The i64 count is truncated to i32
; before being returned, so the checks expect clz.b64 with no cvt between it
; and the store -- the same sequence as the false form.
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
  %trunc = trunc i64 %val to i32
  ret i32 %trunc
}
77
78; ctlz.i16 is implemented by extending the input to i32, computing the result,
79; and then truncating the result back down to i16.  But the NVPTX ABI
80; zero-extends i16 return values to i32, so the final truncation doesn't appear
81; in this function.
; CHECK-LABEL: myctlz_ret16(
define i16 @myctlz_ret16(i16 %a) {
; i16 count with the flag operand set to false, returned directly.  The
; checks expect the input to be widened (cvt.u32.u16), counted with clz.b32,
; and adjusted with a sub, with no narrowing cvt before the store.
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-NEXT: sub.
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %lz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
  ret i16 %lz
}
; CHECK-LABEL: myctlz_ret16_2(
define i16 @myctlz_ret16_2(i16 %a) {
; i16 count with the flag operand set to true, returned directly.  The
; expected sequence matches the false form: cvt.u32.u16, clz.b32, sub, store.
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-NEXT: sub.
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %lz = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
  ret i16 %lz
}
104
; Here we store the result of ctlz.i16 through an i16 pointer, so the trunc
; should remain.
; CHECK-LABEL: myctlz_store16(
define void @myctlz_store16(i16 %a, i16* %b) {
; i16 count with the flag operand set to false, written to memory through %b.
; The sub and the narrowing cvt.u16.u32 are matched with the DAG form of the
; check directive, so either order is accepted ahead of the 16-bit store.
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-DAG: cvt.u16.u32
; CHECK-DAG: sub.
; CHECK: st.{{[a-z]}}16
; CHECK: ret;
  %lz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
  store i16 %lz, i16* %b
  ret void
}
; CHECK-LABEL: myctlz_store16_2(
define void @myctlz_store16_2(i16 %a, i16* %b) {
; Counterpart of myctlz_store16 with the flag operand set to true, matching
; the other *_2 variants in this file.  The i16 count is written to memory
; through %b; the sub and the narrowing cvt.u16.u32 are matched with the DAG
; form of the check directive, so either order is accepted ahead of the
; 16-bit store.  The expected sequence is the same as for the false form.
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-DAG: cvt.u16.u32
; CHECK-DAG: sub.
; CHECK: st.{{[a-z]}}16
; CHECK: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
  store i16 %val, i16* %b
  ret void
}
133