; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-MUBUF %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,GFX9,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-FLATSCR %s

; Checks code generation for stores of the high 16 bits of a 32-bit value
; (element 1 of a <2 x i16> / <2 x half> bitcast, or an explicit lshr by 16),
; optionally truncated to i8, into global, flat, private and local memory.
;
; The GFX900 prefixes expect a single *_d16_hi store that takes the original
; 32-bit register with no preceding shift. The NO-D16-HI prefix (the gfx906
; invocation with +sram-ecc, and fiji) expects a v_lshrrev_b32 by 16 followed
; by an ordinary short/byte store.
;
; Every function is a non-kernel function, so arguments arrive in VGPRs
; starting at v0 and the body is bracketed by s_waitcnt ... s_setpc_b64.

;------------------------------------------------------------------------------
; Global memory: addrspace(1)
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}store_global_hi_v2i16:
; GCN: s_waitcnt

; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_short v[0:1], v2
; GFX906-NEXT: global_store_short v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16(i16 addrspace(1)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  store i16 %hi, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_global_hi_v2f16:
; GCN: s_waitcnt

; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_short v[0:1], v2
; GFX906-NEXT: global_store_short v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2f16(half addrspace(1)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x half>
  %hi = extractelement <2 x half> %value, i32 1
  store half %hi, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_global_hi_i32_shift:
; GCN: s_waitcnt

; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_short v[0:1], v2
; GFX906-NEXT: global_store_short v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_i32_shift(i16 addrspace(1)* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i16
  store i16 %hi, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_global_hi_v2i16_i8:
; GCN: s_waitcnt

; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_byte v[0:1], v2
; GFX906-NEXT: global_store_byte v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_i8(i8 addrspace(1)* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  store i8 %trunc, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_global_hi_i8_shift:
; GCN: s_waitcnt

; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_byte v[0:1], v2
; GFX906-NEXT: global_store_byte v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_i8_shift(i8 addrspace(1)* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i8
  store i8 %hi, i8 addrspace(1)* %out
  ret void
}

; Byte offset is 2047 * 2 = 4094. Both gfx9 invocations are expected to fold it
; into the store; fiji is expected to add it to the pointer first.
; GCN-LABEL: {{^}}store_global_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:4094

; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_short v[0:1], v2{{$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: global_store_short v[0:1], v2, off offset:4094

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_max_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 2047
  store i16 %hi, i16 addrspace(1)* %gep
  ret void
}

; Byte offset is -2048 * 2 = -4096.
; GCN-LABEL: {{^}}store_global_hi_v2i16_min_offset:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:-4096{{$}}

; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_short v[0:1], v{{[0-9]$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: global_store_short v[0:1], v2, off offset:-4096{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_min_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 -2048
  store i16 %hi, i16 addrspace(1)* %gep
  ret void
}

; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:4095

; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_byte v[0:1], v{{[0-9]$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: global_store_byte v[0:1], v2, off offset:4095

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_i8_max_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4095
  store i8 %trunc, i8 addrspace(1)* %gep
  ret void
}

; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_min_offset:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:-4095

; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_byte v[0:1], v{{[0-9]$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: global_store_byte v[0:1], v2, off offset:-4095

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_i8_min_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 -4095
  store i8 %trunc, i8 addrspace(1)* %gep
  ret void
}

;------------------------------------------------------------------------------
; Flat memory: default address space
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}store_flat_hi_v2i16:
; GCN: s_waitcnt

; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_short v[0:1], v2

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16(i16* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  store i16 %hi, i16* %out
  ret void
}

; GCN-LABEL: {{^}}store_flat_hi_v2f16:
; GCN: s_waitcnt

; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_short v[0:1], v2

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2f16(half* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x half>
  %hi = extractelement <2 x half> %value, i32 1
  store half %hi, half* %out
  ret void
}

; GCN-LABEL: {{^}}store_flat_hi_i32_shift:
; GCN: s_waitcnt

; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_short v[0:1], v2

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_i32_shift(i16* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i16
  store i16 %hi, i16* %out
  ret void
}

; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8:
; GCN: s_waitcnt

; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16_i8(i8* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  store i8 %trunc, i8* %out
  ret void
}

; GCN-LABEL: {{^}}store_flat_hi_i8_shift:
; GCN: s_waitcnt

; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_i8_shift(i8* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i8
  store i8 %hi, i8* %out
  ret void
}

; Byte offset is 2047 * 2 = 4094.
; GCN-LABEL: {{^}}store_flat_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: flat_store_short v[0:1], v2 offset:4094

; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_short v[0:1], v2{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16_max_offset(i16* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16* %out, i64 2047
  store i16 %hi, i16* %gep
  ret void
}

; Byte offset is -1023 * 2 = -2046 = 0xfffff802. Every target is expected to add
; it to the pointer explicitly and emit the flat store with no offset.
; GCN-LABEL: {{^}}store_flat_hi_v2i16_neg_offset:
; GCN: s_waitcnt
; GFX803: v_add{{(_co)?}}_{{i|u}}32_e32
; GFX803: v_addc_u32_e32

; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff802, v
; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v

; GFX906-DAG: v_lshrrev_b32_e32
; GFX906: flat_store_short v[0:1], v2{{$}}

; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
; GFX803: flat_store_short v[0:1], v2{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16_neg_offset(i16* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16* %out, i64 -1023
  store i16 %hi, i16* %gep
  ret void
}

; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:4095{{$}}

; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803: flat_store_byte v[0:1], v2{{$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: flat_store_byte v[0:1], v2 offset:4095{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16_i8_max_offset(i8* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  %gep = getelementptr inbounds i8, i8* %out, i64 4095
  store i8 %trunc, i8* %gep
  ret void
}

; Byte offset is -4095 = 0xfffff001, expected to be added to the pointer
; explicitly.
; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_neg_offset:
; GCN: s_waitcnt

; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32

; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff001, v
; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v{{[0-9]+}}, vcc

; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}

; GFX906-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX906: flat_store_byte v[0:1], v2{{$}}

; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_byte v[0:1], v2{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16_i8_neg_offset(i8* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  %gep = getelementptr inbounds i8, i8* %out, i64 -4095
  store i8 %trunc, i8* %gep
  ret void
}

;------------------------------------------------------------------------------
; Private memory: addrspace(5)
; Checked both as MUBUF buffer_store_* and, with -amdgpu-enable-flat-scratch,
; as scratch_store_*.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}store_private_hi_v2i16:
; GCN: s_waitcnt

; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2i16(i16 addrspace(5)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  store i16 %hi, i16 addrspace(5)* %out
  ret void
}

; GCN-LABEL: {{^}}store_private_hi_v2f16:
; GCN: s_waitcnt

; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2f16(half addrspace(5)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x half>
  %hi = extractelement <2 x half> %value, i32 1
  store half %hi, half addrspace(5)* %out
  ret void
}

; GCN-LABEL: {{^}}store_private_hi_i32_shift:
; GCN: s_waitcnt

; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_i32_shift(i16 addrspace(5)* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i16
  store i16 %hi, i16 addrspace(5)* %out
  ret void
}

; GCN-LABEL: {{^}}store_private_hi_v2i16_i8:
; GCN: s_waitcnt

; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2i16_i8(i8 addrspace(5)* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  store i8 %trunc, i8 addrspace(5)* %out
  ret void
}

; GCN-LABEL: {{^}}store_private_hi_i8_shift:
; GCN: s_waitcnt

; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_i8_shift(i8 addrspace(5)* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i8
  store i8 %hi, i8 addrspace(5)* %out
  ret void
}

; %out is a byval stack argument, so the checks address it relative to s32;
; the byte offset is 2047 * 2 = 4094.
; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-FLATSCR: scratch_store_short_d16_hi off, v0, s32 offset:4094{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2i16_max_offset(i16 addrspace(5)* byval(i16) %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2047
  store i16 %hi, i16 addrspace(5)* %gep
  ret void
}

; Volatile store to the null private pointer: no VGPR address and no immediate
; offset are expected on the store.
; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff:
; GCN: s_waitcnt

; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}}
; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, [[SOFF]]{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], 0{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2i16_nooff(i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  store volatile i16 %hi, i16 addrspace(5)* null
  ret void
}

; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff:
; GCN: s_waitcnt

; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}}
; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, [[SOFF]]{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
; NO-D16-HI: buffer_store_byte v0, off, s[0:3], 0{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2i16_i8_nooff(i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  store volatile i8 %trunc, i8 addrspace(5)* null
  ret void
}

;------------------------------------------------------------------------------
; Local memory: addrspace(3)
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}store_local_hi_v2i16:
; GCN: s_waitcnt

; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b16 v0, v1

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_local_hi_v2i16(i16 addrspace(3)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  store i16 %hi, i16 addrspace(3)* %out
  ret void
}

; GCN-LABEL: {{^}}store_local_hi_v2f16:
; GCN: s_waitcnt

; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b16 v0, v1

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_local_hi_v2f16(half addrspace(3)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x half>
  %hi = extractelement <2 x half> %value, i32 1
  store half %hi, half addrspace(3)* %out
  ret void
}

; GCN-LABEL: {{^}}store_local_hi_i32_shift:
; GCN: s_waitcnt

; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b16 v0, v1

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_local_hi_i32_shift(i16 addrspace(3)* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i16
  store i16 %hi, i16 addrspace(3)* %out
  ret void
}

; GCN-LABEL: {{^}}store_local_hi_v2i16_i8:
; GCN: s_waitcnt

; GFX900-NEXT: ds_write_b8_d16_hi v0, v1{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b8 v0, v1

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_local_hi_v2i16_i8(i8 addrspace(3)* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  store i8 %trunc, i8 addrspace(3)* %out
  ret void
}

; Byte offset is 32767 * 2 = 65534.
; GCN-LABEL: {{^}}store_local_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: ds_write_b16_d16_hi v0, v1 offset:65534{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b16 v0, v1 offset:65534{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_local_hi_v2i16_max_offset(i16 addrspace(3)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %out, i64 32767
  store i16 %hi, i16 addrspace(3)* %gep
  ret void
}

;------------------------------------------------------------------------------
; Private memory: stores into a stack object at a large frame offset
; (GFX900 checks only)
;------------------------------------------------------------------------------

; Expected offset 4094 = 40 (size of %obj0) + 2027 * 2; this presumably relies
; on %obj1 being laid out directly after %obj0 in the frame.
; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4094
define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
  store i16 %hi, i16 addrspace(5)* %gep
  ret void
}

; Expected offset 4095 = 40 (size of %obj0) + 4055; same layout assumption as
; the i16 variant.
; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4095
define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %trunc = trunc i16 %hi to i8
  store i8 %trunc, i8 addrspace(5)* %gep
  ret void
}

attributes #0 = { nounwind }