; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s

; Codegen test for the llvm.ctlz.* intrinsics on NVPTX.
;
; Every scenario below is written as a pair of functions: the plain one calls
; the intrinsic with its second argument (is_zero_undef) set to false, and the
; "_2" one calls it with that argument set to true. Both members of a pair are
; checked against the same expected PTX sequence.

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

declare i16 @llvm.ctlz.i16(i16, i1) readnone
declare i32 @llvm.ctlz.i32(i32, i1) readnone
declare i64 @llvm.ctlz.i64(i64, i1) readnone

; There should be no difference between llvm.ctlz.i32(%a, true) and
; llvm.ctlz.i32(%a, false), as ptx's clz is well defined for a zero input: it
; returns the operand width (32 for clz.b32, 64 for clz.b64), which is exactly
; the result llvm.ctlz must produce when its second argument is false.

; 32-bit input, zero input is defined.
; CHECK-LABEL: myctlz(
define i32 @myctlz(i32 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
  ret i32 %val
}
; 32-bit input, zero input is undefined.
; CHECK-LABEL: myctlz_2(
define i32 @myctlz_2(i32 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
  ret i32 %val
}

; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
; value, so here we have to zero-extend it.
; CHECK-LABEL: myctlz64(
define i64 @myctlz64(i64 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: cvt.u64.u32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
  ret i64 %val
}
; CHECK-LABEL: myctlz64_2(
define i64 @myctlz64_2(i64 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: cvt.u64.u32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
  ret i64 %val
}

; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
; natural return width of ptx's clz.b64 instruction.  No conversions should be
; necessary in the PTX.
; CHECK-LABEL: myctlz64_as_32(
define i32 @myctlz64_as_32(i64 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
  %trunc = trunc i64 %val to i32
  ret i32 %trunc
}
; Zero-undef variant of the function above; it must pass true so that it does
; not merely repeat myctlz64_as_32.
; CHECK-LABEL: myctlz64_as_32_2(
define i32 @myctlz64_as_32_2(i64 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
  %trunc = trunc i64 %val to i32
  ret i32 %trunc
}

; ctlz.i16 is implemented by extending the input to i32, computing the result,
; and then truncating the result back down to i16.  But the NVPTX ABI
; zero-extends i16 return values to i32, so the final truncation doesn't appear
; in this function.
; CHECK-LABEL: myctlz_ret16(
define i16 @myctlz_ret16(i16 %a) {
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-NEXT: sub.
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
  ret i16 %val
}
; CHECK-LABEL: myctlz_ret16_2(
define i16 @myctlz_ret16_2(i16 %a) {
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-NEXT: sub.
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
  ret i16 %val
}

; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
; remain.
; CHECK-LABEL: myctlz_store16(
define void @myctlz_store16(i16 %a, i16* %b) {
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-DAG: cvt.u16.u32
; CHECK-DAG: sub.
; CHECK: st.{{[a-z]}}16
; CHECK: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
  store i16 %val, i16* %b
  ret void
}
; Zero-undef variant of the function above; it must pass true so that it does
; not merely repeat myctlz_store16.
; CHECK-LABEL: myctlz_store16_2(
define void @myctlz_store16_2(i16 %a, i16* %b) {
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-DAG: cvt.u16.u32
; CHECK-DAG: sub.
; CHECK: st.{{[a-z]}}16
; CHECK: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
  store i16 %val, i16* %b
  ret void
}