; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}select_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI-DAG: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; VI: v_cmp_gt_f16_e32 vcc, 0.5, v[[A_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_f16_imm_c:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_f16_imm_d:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}}
; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_d(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32

; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32
; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32

; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16_imm_a:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; SI: v_cmp_gt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_lt_f32_e32 vcc, 0.5
; SI: v_cndmask_b32_e32

; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32
; VI: v_cmp_gt_f16_e32
; VI: v_cndmask_b32_e32

; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16_imm_b:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_gt_f32_e32 vcc, 0.5
; SI: v_cndmask_b32_e32

; VI: v_cmp_gt_f16_e32
; VI: v_cndmask_b32_e32
; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32

; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16_imm_c:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; SI: v_cmp_nlt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_nlt_f32_e32
; SI-DAG: v_cndmask_b32_e32

; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32

; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32

; SI-DAG: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}select_v2f16_imm_d:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32

; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32
; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32

; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_d(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}