; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s

;; Lowering test for the predicate-returning llvm.nvvm.shfl.sync.* intrinsics.
;;
;; Every function below calls the "down" variant and verifies the emitted
;; shfl.sync.down.b32 instruction.  The three-letter suffix of each function
;; name describes, in order, how the mask, b and c operands are supplied:
;;   r = taken from a function parameter (expected in a register)
;;   i = integer literal in the IR (expected as an immediate in the PTX)
;;
;; The intrinsic takes its operands as (mask, value, b, c); the FileCheck
;; patterns expect the PTX instruction to list them as
;;   out|pred, value, b, c, mask
;;
;; NOTE(review): the up/bfly/idx variants are declared but no function visible
;; in this file calls them.
;; NOTE(review): the predicate result is captured as OUTP in every pattern but
;; is never matched again afterwards; only the 32-bit value is followed to the
;; st.param that returns it.

declare {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32, i32, i32, i32)
declare {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32, float, i32, i32)
declare {i32, i1} @llvm.nvvm.shfl.sync.up.i32p(i32, i32, i32, i32)
declare {float, i1} @llvm.nvvm.shfl.sync.up.f32p(i32, float, i32, i32)
declare {i32, i1} @llvm.nvvm.shfl.sync.bfly.i32p(i32, i32, i32, i32)
declare {float, i1} @llvm.nvvm.shfl.sync.bfly.f32p(i32, float, i32, i32)
declare {i32, i1} @llvm.nvvm.shfl.sync.idx.i32p(i32, i32, i32, i32)
declare {float, i1} @llvm.nvvm.shfl.sync.idx.f32p(i32, float, i32, i32)

;; i32 payload: mask, b and c all come from parameters.
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.rrr
define {i32, i1} @shfl.sync.i32.rrr(i32 %mask, i32 %a, i32 %b, i32 %c) {
  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 %b, i32 %c)
  ret {i32, i1} %val
}

;; i32 payload: literal mask (1); b and c come from parameters.
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.irr
define {i32, i1} @shfl.sync.i32.irr(i32 %a, i32 %b, i32 %c) {
  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1;
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 %b, i32 %c)
  ret {i32, i1} %val
}

;; i32 payload: mask and b come from parameters; literal c (1).
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.rri
define {i32, i1} @shfl.sync.i32.rri(i32 %mask, i32 %a, i32 %b) {
  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]];
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 %b, i32 1)
  ret {i32, i1} %val
}

;; i32 payload: literal mask (1); b comes from a parameter; literal c (2).
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.iri
define {i32, i1} @shfl.sync.i32.iri(i32 %a, i32 %b) {
  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1;
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 %b, i32 2)
  ret {i32, i1} %val
}

;; i32 payload: mask comes from a parameter; literal b (1); c from a parameter.
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.rir
define {i32, i1} @shfl.sync.i32.rir(i32 %mask, i32 %a, i32 %c) {
  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]];
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 1, i32 %c)
  ret {i32, i1} %val
}

;; i32 payload: literal mask (1) and literal b (2); c comes from a parameter.
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.iir
define {i32, i1} @shfl.sync.i32.iir(i32 %a, i32 %c) {
  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1;
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 2, i32 %c)
  ret {i32, i1} %val
}

;; i32 payload: mask comes from a parameter; literal b (1) and literal c (2).
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.rii
define {i32, i1} @shfl.sync.i32.rii(i32 %mask, i32 %a) {
  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]];
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 1, i32 2)
  ret {i32, i1} %val
}

;; i32 payload: mask (1), b (2) and c (3) are all literals.
;; NOTE(review): %b is not referenced in the body; it is kept so the function
;; signature stays exactly as it was.
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.iii
define {i32, i1} @shfl.sync.i32.iii(i32 %a, i32 %b) {
  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1;
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 2, i32 3)
  ret {i32, i1} %val
}

;; Same intrinsics, but for float

;; float payload: mask, b and c all come from parameters.
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.rrr
define {float, i1} @shfl.sync.f32.rrr(i32 %mask, float %a, i32 %b, i32 %c) {
  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 %c)
  ret {float, i1} %val
}

;; float payload: literal mask (1); b and c come from parameters.
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.irr
define {float, i1} @shfl.sync.f32.irr(float %a, i32 %b, i32 %c) {
  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1;
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 %c)
  ret {float, i1} %val
}

;; float payload: mask and b come from parameters; literal c (1).
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.rri
define {float, i1} @shfl.sync.f32.rri(i32 %mask, float %a, i32 %b) {
  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]];
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 1)
  ret {float, i1} %val
}

;; float payload: literal mask (1); b comes from a parameter; literal c (2).
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.iri
define {float, i1} @shfl.sync.f32.iri(float %a, i32 %b) {
  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1;
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 2)
  ret {float, i1} %val
}

;; float payload: mask comes from a parameter; literal b (1); c from a parameter.
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.rir
define {float, i1} @shfl.sync.f32.rir(i32 %mask, float %a, i32 %c) {
  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]];
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 %c)
  ret {float, i1} %val
}

;; float payload: literal mask (1) and literal b (2); c comes from a parameter.
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.iir
define {float, i1} @shfl.sync.f32.iir(float %a, i32 %c) {
  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1;
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 %c)
  ret {float, i1} %val
}

;; float payload: mask comes from a parameter; literal b (1) and literal c (2).
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.rii
define {float, i1} @shfl.sync.f32.rii(i32 %mask, float %a) {
  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]];
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 2)
  ret {float, i1} %val
}

;; float payload: mask (1), b (2) and c (3) are all literals.
;; NOTE(review): %b is not referenced in the body; it is kept so the function
;; signature stays exactly as it was.
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.iii
define {float, i1} @shfl.sync.f32.iii(float %a, i32 %b) {
  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1;
  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 3)
  ret {float, i1} %val
}