; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; Codegen checks for the half type on amdgcn. Every kernel below is compiled
; by both commands above and its assembly is pattern-matched. Kernels that
; carry only a label check merely have to compile and emit their symbol.

; half args should be promoted to float

; Scalar half kernel argument stored straight to %out.
; GCN-LABEL: {{^}}load_f16_arg:
; GCN: s_load_dword [[ARG:s[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
; GCN: buffer_store_short [[CVT]]
define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
  store half %arg, half addrspace(1)* %out
  ret void
}

; <2 x half> kernel argument: the checks expect one 16-bit load and one 16-bit
; store per element, each store fed by the matching load.
; GCN-LABEL: {{^}}load_v2f16_arg:
; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
; GCN-DAG: buffer_store_short [[V0]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_short [[V1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
  store <2 x half> %arg, <2 x half> addrspace(1)* %out
  ret void
}

; <3 x half> kernel argument: exactly three loads, then exactly one dword
; store and one short store (in either order).
; GCN-LABEL: {{^}}load_v3f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN-NOT: buffer_load
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_short
; GCN-NOT: buffer_store
; GCN: s_endpgm
define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
  store <3 x half> %arg, <3 x half> addrspace(1)* %out
  ret void
}

; <4 x half> kernel argument: four 16-bit loads followed by four 16-bit stores.
; GCN-LABEL: {{^}}load_v4f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
  store <4 x half> %arg, <4 x half> addrspace(1)* %out
  ret void
}

; <8 x half> kernel argument; label-only check.
; GCN-LABEL: {{^}}load_v8f16_arg:
define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
  store <8 x half> %arg, <8 x half> addrspace(1)* %out
  ret void
}

; fpext of a <2 x half> kernel argument to <2 x float>; label-only check.
; GCN-LABEL: {{^}}extload_v2f16_arg:
define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
  %fpext = fpext <2 x half> %in to <2 x float>
  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
  ret void
}

; fpext of a scalar half kernel argument to float; label-only check.
; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to float
  store float %ext, float addrspace(1)* %out
  ret void
}

; Same operation as extload_v2f16_arg under a different name; label-only check.
; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x float>
  store <2 x float> %ext, <2 x float> addrspace(1)* %out
  ret void
}

; <3 x half> argument extended to <3 x float>: exactly three loads and exactly
; three half-to-float conversions, then a dword and a dwordx2 store.
; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN-NOT: buffer_load
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN-NOT: v_cvt_f32_f16
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_dwordx2
; GCN: s_endpgm
define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x float>
  store <3 x float> %ext, <3 x float> addrspace(1)* %out
  ret void
}

; <4 x half> argument extended to <4 x float>; label-only check.
; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x float>
  store <4 x float> %ext, <4 x float> addrspace(1)* %out
  ret void
}

; <8 x half> argument extended to <8 x float>: eight loads, eight conversions,
; two dwordx4 stores.
; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x float>
  store <8 x float> %ext, <8 x float> addrspace(1)* %out
  ret void
}

; Scalar half argument extended to double. The scalar-load offset operand is
; matched separately for each of the two -mcpu values.
; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}}
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]]
; GCN: buffer_store_dwordx2 [[RESULT]]
define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to double
  store double %ext, double addrspace(1)* %out
  ret void
}

; <2 x half> argument extended to <2 x double>: per element one load, one
; half-to-float and one float-to-double conversion, in any order.
; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x double>
  store <2 x double> %ext, <2 x double> addrspace(1)* %out
  ret void
}

; Three-element variant of the test above.
; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x double>
  store <3 x double> %ext, <3 x double> addrspace(1)* %out
  ret void
}

; Four-element variant.
; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x double>
  store <4 x double> %ext, <4 x double> addrspace(1)* %out
  ret void
}

; Eight-element variant.
; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v

; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN: s_endpgm
define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x double>
  store <8 x double> %ext, <8 x double> addrspace(1)* %out
  ret void
}

; Plain copy of one half through addrspace(1): the stored register must be the
; loaded one.
; GCN-LABEL: {{^}}global_load_store_f16:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  store half %val, half addrspace(1)* %out
  ret void
}

; Copy of <2 x half>, expected as a single dword load/store pair.
; GCN-LABEL: {{^}}global_load_store_v2f16:
; GCN: buffer_load_dword [[TMP:v[0-9]+]]
; GCN: buffer_store_dword [[TMP]]
define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  store <2 x half> %val, <2 x half> addrspace(1)* %out
  ret void
}

; Copy of <4 x half>, expected as a single dwordx2 load/store pair.
; NOTE: this kernel takes %in before %out, unlike its neighbours; the order is
; left as-is because the checks do not depend on it.
; GCN-LABEL: {{^}}global_load_store_v4f16:
; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[TMP]]
define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  store <4 x half> %val, <4 x half> addrspace(1)* %out
  ret void
}

; Copy of <8 x half>, expected as a single dwordx4 load/store pair. The store
; check uses the captured tuple (rather than capturing a fresh one) so that it
; actually ties the store to the load, like the narrower tests above.
; GCN-LABEL: {{^}}global_load_store_v8f16:
; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[TMP]]
; GCN: s_endpgm
define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  store <8 x half> %val, <8 x half> addrspace(1)* %out
  ret void
}

; Load half, extend to float, store: load -> convert -> store chained through
; captured registers.
; GCN-LABEL: {{^}}global_extload_f16_to_f32:
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_dword [[CVT]]
define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

; <2 x half> loaded and extended to <2 x float>: two 16-bit loads, two
; conversions, and one dwordx2 store of the two converted registers.
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]]
; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
; GCN: s_endpgm
define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
  ret void
}

; <3 x half> loaded and extended to <3 x float>; label-only check.
; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
  ret void
}

; <4 x half> loaded and extended to <4 x float>; label-only check.
; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
  ret void
}

; <8 x half> loaded and extended to <8 x float>; label-only check.
; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x float>
  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
  ret void
}

; <16 x half> loaded and extended to <16 x float>: sixteen loads, sixteen
; conversions, four dwordx4 stores.
; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4

; GCN: s_endpgm
define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x float>
  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
  ret void
}

; Load half, extend to double: the checks expect a two-step conversion
; (half -> float -> double) chained through captured registers.
; GCN-LABEL: {{^}}global_extload_f16_to_f64:
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
; GCN: buffer_store_dwordx2 [[CVT1]]
define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to double
  store double %cvt, double addrspace(1)* %out
  ret void
}

; <2 x half> loaded and extended to <2 x double>; the final dwordx4 store must
; span from the low register of the first result to the high register of the
; second.
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
; GCN: s_endpgm
define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x double>
  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
  ret void
}

; <3 x half> loaded and extended to <3 x double>: one dwordx2 load that is
; split with shifts, exactly three conversions of each kind, then a dwordx4
; store at offset 0 and a dwordx2 store at offset 16.
; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:

; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]], 32
; VI: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 32, [[LOAD]]
; GCN: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN-NOT: v_cvt_f32_f16_e32

; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN-NOT: v_cvt_f64_f32_e32

; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
; GCN: s_endpgm
define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x double>
  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
  ret void
}

; <4 x half> loaded and extended to <4 x double>; label-only check.
; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x double>
  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %halves = load <8 x half>, <8 x half> addrspace(1)* %in
  %doubles = fpext <8 x half> %halves to <8 x double>
  store <8 x double> %doubles, <8 x double> addrspace(1)* %out
  ret void
}

; Sixteen-element half -> double extension; only the label is checked.
; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %halves = load <16 x half>, <16 x half> addrspace(1)* %in
  %doubles = fpext <16 x half> %halves to <16 x double>
  store <16 x double> %doubles, <16 x double> addrspace(1)* %out
  ret void
}

; Truncating store of one float: load, convert to half, 16-bit store, each
; step consuming the register produced by the previous one.
; GCN-LABEL: {{^}}global_truncstore_f32_to_f16:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_short [[CVT]]
define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %wide = load float, float addrspace(1)* %in
  %narrow = fptrunc float %wide to half
  store half %narrow, half addrspace(1)* %out
  ret void
}

; Two-element truncating store: both halves of the loaded register pair are
; converted and stored individually.
; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
; GCN-DAG: buffer_store_short [[CVT0]]
; GCN-DAG: buffer_store_short [[CVT1]]
; GCN: s_endpgm
define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
  %wide = load <2 x float>, <2 x float> addrspace(1)* %in
  %narrow = fptrunc <2 x float> %wide to <2 x half>
  store <2 x half> %narrow, <2 x half> addrspace(1)* %out
  ret void
}

; Three-element truncating store: one dwordx4 load, exactly three
; conversions, then a short store followed by a dword store.
; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN-NOT: v_cvt_f16_f32_e32
; GCN: buffer_store_short
; GCN: buffer_store_dword
; GCN: s_endpgm
define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
  %wide = load <3 x float>, <3 x float> addrspace(1)* %in
  %narrow = fptrunc <3 x float> %wide to <3 x half>
  store <3 x half> %narrow, <3 x half> addrspace(1)* %out
  ret void
}

; Four-element truncating store: one dwordx4 load, four conversions, four
; 16-bit stores.
; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %wide = load <4 x float>, <4 x float> addrspace(1)* %in
  %narrow = fptrunc <4 x float> %wide to <4 x half>
  store <4 x half> %narrow, <4 x half> addrspace(1)* %out
  ret void
}

; Eight-element truncating store: two dwordx4 loads, eight conversions, eight
; 16-bit stores.
; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
  %wide = load <8 x float>, <8 x float> addrspace(1)* %in
  %narrow = fptrunc <8 x float> %wide to <8 x half>
  store <8 x half> %narrow, <8 x half> addrspace(1)* %out
  ret void
}

; Sixteen-element truncating store: four dwordx4 loads in order, then sixteen
; conversions and sixteen 16-bit stores that may interleave freely.
; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN: s_endpgm
define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
  %wide = load <16 x float>, <16 x float> addrspace(1)* %in
  %narrow = fptrunc <16 x float> %wide to <16 x half>
  store <16 x half> %narrow, <16 x half> addrspace(1)* %out
  ret void
}

; Scalar half addition of two kernel arguments. Only the first llc invocation
; has instruction-level checks here; the second just has to reach s_endpgm.
; FIXME: Unsafe math should fold conversions away
; GCN-LABEL: {{^}}fadd_f16:
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
  %sum = fadd half %a, %b
  store half %sum, half addrspace(1)* %out, align 4
  ret void
}

; Two-element half addition of kernel arguments: one float add per element.
; GCN-LABEL: {{^}}fadd_v2f16:
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
  %sum = fadd <2 x half> %a, %b
  store <2 x half> %sum, <2 x half> addrspace(1)* %out, align 8
  ret void
}

; Four-element half addition; operands are element 0 and element 1 of the
; <4 x half> array at %in.
; GCN-LABEL: {{^}}fadd_v4f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %rhs.ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
  %lhs = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
  %rhs = load <4 x half>, <4 x half> addrspace(1)* %rhs.ptr, align 16
  %sum = fadd <4 x half> %lhs, %rhs
  store <4 x half> %sum, <4 x half> addrspace(1)* %out, align 16
  ret void
}

; Eight-element half addition of kernel arguments: eight float adds.
; GCN-LABEL: {{^}}fadd_v8f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
  %sum = fadd <8 x half> %a, %b
  store <8 x half> %sum, <8 x half> addrspace(1)* %out, align 32
  ret void
}

; Scalar half subtraction of two consecutive halves read from %in.
; GCN-LABEL: {{^}}fsub_f16:
; GCN: v_subrev_f32_e32
; GCN: s_endpgm
define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %rhs.ptr = getelementptr half, half addrspace(1)* %in, i32 1
  %lhs = load half, half addrspace(1)* %in
  %rhs = load half, half addrspace(1)* %rhs.ptr
  %diff = fsub half %lhs, %rhs
  store half %diff, half addrspace(1)* %out
  ret void
}

; Bitcast half -> i16 must be a plain 16-bit copy (no conversion between the
; load and the store of the same register).
; GCN-LABEL: {{^}}test_bitcast_from_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
  %as.half = load half, half addrspace(1)* %in
  %bits = bitcast half %as.half to i16
  store i16 %bits, i16 addrspace(1)* %out
  ret void
}

; Bitcast i16 -> half, likewise expected to be a plain 16-bit copy.
; GCN-LABEL: {{^}}test_bitcast_to_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
  %bits = load i16, i16 addrspace(1)* %in
  %as.half = bitcast i16 %bits to half
  store half %as.half, half addrspace(1)* %out
  ret void
}

; Attribute group shared by every kernel in this file.
attributes #0 = { nounwind }