; RUN: llc -amdgpu-load-store-vectorizer=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; Codegen checks for the llvm.amdgcn.permlane16 and llvm.amdgcn.permlanex16
; intrinsics when compiled for gfx1010.
;
; Kernel naming, as far as the checks below show:
;   * The three letters after "b32_" describe the expected operand kinds of the
;     emitted instruction's three sources: v = VGPR, s = SGPR taken from a
;     kernel argument, i = small constant expected as an inline operand,
;     l = larger constant expected to be moved into an SGPR first. A 'v' in the
;     second or third position means the IR value is a workitem id, which is
;     expected to be moved to an SGPR with v_readfirstlane_b32.
;   * "_fi" / "_bc" set the fifth / sixth (i1) intrinsic argument; these are
;     expected to show up as the first / second op_sel bit.
;   * "tid_tid", "undef_tid" and "i_tid" vary the first intrinsic argument
;     (workitem id, undef, or the constant 12345) while the second argument is
;     the workitem id.

; Attribute group #0 (nounwind readnone convergent) is attached to the permlane
; declarations; it has no other user in this file. Kernels use #1.
declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #0
declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #0
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.workitem.id.y()

; ---- permlane16: operand-kind combinations ----

; GCN-LABEL: {{^}}v_permlane16_b32_vss:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlane16_b32_vss(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlane16_b32_vii:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2{{$}}
define amdgpu_kernel void @v_permlane16_b32_vii(i32 addrspace(1)* %out, i32 %src0) #1 {
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 1, i32 2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; 4660 == 0x1234 and 49617 == 0xc1d1.
; GCN-LABEL: {{^}}v_permlane16_b32_vll:
; FIXME-GFX10: It is allowed to have both immediates as literals
; GFX10-DAG: s_movk_i32 [[SRC1:s[0-9]+]], 0x1234
; GFX10-DAG: s_mov_b32 [[SRC2:s[0-9]+]], 0xc1d1
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
define amdgpu_kernel void @v_permlane16_b32_vll(i32 addrspace(1)* %out, i32 %src0) #1 {
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlane16_b32_vvv:
; GFX10-DAG: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
; GFX10-DAG: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
define amdgpu_kernel void @v_permlane16_b32_vvv(i32 addrspace(1)* %out, i32 %src0) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlane16_b32_vvs:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlane16_b32_vvs(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlane16_b32_vsv:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, [[SRC2]]{{$}}
define amdgpu_kernel void @v_permlane16_b32_vsv(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; ---- permlane16: i1 flag combinations (expected as op_sel bits) ----

; GCN-LABEL: {{^}}v_permlane16_b32_vss_fi:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
define amdgpu_kernel void @v_permlane16_b32_vss_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlane16_b32_vss_bc:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
define amdgpu_kernel void @v_permlane16_b32_vss_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlane16_b32_vss_fi_bc:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; ---- permlanex16: operand-kind combinations ----

; GCN-LABEL: {{^}}v_permlanex16_b32_vss:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vss(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlanex16_b32_vii:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vii(i32 addrspace(1)* %out, i32 %src0) #1 {
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 1, i32 2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; 4660 == 0x1234 and 49617 == 0xc1d1.
; GCN-LABEL: {{^}}v_permlanex16_b32_vll:
; FIXME-GFX10: It is allowed to have both immediates as literals
; GFX10-DAG: s_movk_i32 [[SRC1:s[0-9]+]], 0x1234
; GFX10-DAG: s_mov_b32 [[SRC2:s[0-9]+]], 0xc1d1
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vll(i32 addrspace(1)* %out, i32 %src0) #1 {
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlanex16_b32_vvv:
; GFX10-DAG: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
; GFX10-DAG: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vvv(i32 addrspace(1)* %out, i32 %src0) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlanex16_b32_vvs:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vvs(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlanex16_b32_vsv:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, [[SRC2]]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vsv(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; ---- permlanex16: i1 flag combinations (expected as op_sel bits) ----

; GCN-LABEL: {{^}}v_permlanex16_b32_vss_fi:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vss_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlanex16_b32_vss_bc:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vss_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlanex16_b32_vss_fi_bc:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; ---- permlane16: handling of the first ("old") argument ----
; 12345 == 0x3039. The i_tid checks expect the constant to be moved into the
; destination register only when both i1 flags are 0, and not to appear at all
; when either flag is set.

; GCN-LABEL: {{^}}v_permlane16_b32_tid_tid:
; GFX10: v_permlane16_b32 v0, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlane16_b32_tid_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlane16_b32_undef_tid:
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlane16_b32_undef_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlane16_b32_i_tid:
; GFX10: v_mov_b32_e32 [[OLD:v[0-9]+]], 0x3039
; GFX10: v_permlane16_b32 [[OLD]], v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlane16_b32_i_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_fi:
; GFX10-NOT: 0x3039
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_bc:
; GFX10-NOT: 0x3039
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_fi_bc:
; GFX10-NOT: 0x3039
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; ---- permlanex16: handling of the first ("old") argument ----

; GCN-LABEL: {{^}}v_permlanex16_b32_tid_tid:
; GFX10: v_permlanex16_b32 v0, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlanex16_b32_tid_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlanex16_b32_undef_tid:
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlanex16_b32_undef_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid:
; GFX10: v_mov_b32_e32 [[OLD:v[0-9]+]], 0x3039
; GFX10: v_permlanex16_b32 [[OLD]], v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlanex16_b32_i_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_fi:
; GFX10-NOT: 0x3039
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_bc:
; GFX10-NOT: 0x3039
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_fi_bc:
; GFX10-NOT: 0x3039
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind readnone convergent }
attributes #1 = { nounwind }