; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s

; Codegen tests for shufflevector on packed 16-bit element vectors (gfx900).
;
; Naming convention used by most tests below: the suffix spells out the
; shuffle mask, one mask index per output lane, with 'u' standing for an undef
; lane. Mask indices smaller than the source element count select lanes of the
; first operand (loaded from %arg0); larger indices select lanes of the second
; operand (loaded from %arg1).
;
; Both operands are always loaded in the IR, even when the mask only uses one
; of them.

; --- <4 x half>, masks containing undef lanes ---

define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_23uu:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_234u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v1, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_u1u3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_u3u1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_u3uu:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_3u6u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_3uu7:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_35u5:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v[2:3], off
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_357u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
  ret <4 x half> %result
}

; --- <4 x half>, masks built from whole even-aligned lane pairs ---

define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0101:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0123:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0145:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0167:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2301:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2323:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2345:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2367:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4501:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v[2:3], off
; GFX9-NEXT:    global_load_dword v1, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4523:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v[2:3], off
; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4545:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4567:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6701:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v1, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6723:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6745:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6767:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
  ret <4 x half> %result
}

; --- <4 x half>, masks whose lane pairs straddle 32-bit boundaries ---

define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2356:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_5623:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_3456:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_5634:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_5734:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
  ret <4 x half> %result
}

; --- <4 x i16> counterparts of two of the <4 x half> tests above ---

define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4i16_2356:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
  %result = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
  ret <4 x i16> %result
}

define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4i16_0167:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
  %result = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i16> %result
}

; --- <4 x half>, masks that repeat or swap individual lanes ---

define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0000:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> zeroinitializer
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_1010:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_1100:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v1, v2, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
; GFX9-NEXT:    v_and_b32_e32 v0, v2, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6161:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2333:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
  ret <4 x half> %result
}

; FIXME: The name advertises mask <6,6,6,7>, but the mask below is <2,3,3,3>,
; the same one used by shuffle_v4f16_2333. As written this duplicates that
; test and never selects a lane of the second operand. Correcting the mask
; requires regenerating the assertions with utils/update_llc_test_checks.py.
define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6667:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %vec1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %result = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
  ret <4 x half> %result
}

; --- <8 x half> sources shuffled down to a <4 x half> result ---
; With eight-element sources, mask indices 0-7 refer to the first operand and
; 8-15 to the second.

define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_0101:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %vec1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %result = shufflevector <8 x half> %vec0, <8 x half> %vec1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_0123:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %vec1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %result = shufflevector <8 x half> %vec0, <8 x half> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_4589:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:8
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %vec1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %result = shufflevector <8 x half> %vec0, <8 x half> %vec1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %vec1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %result = shufflevector <8 x half> %vec0, <8 x half> %vec1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
  ret <4 x half> %result
}

define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:8
; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %vec1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %result = shufflevector <8 x half> %vec0, <8 x half> %vec1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
  ret <4 x half> %result
}

; --- Sources narrower than the <4 x half> result ---

define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v3f16_0122:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
  %vec1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
  %result = shufflevector <3 x half> %vec0, <3 x half> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x half> %result
}

; FIXME: The name advertises mask <0,1,2,2>, but the mask below is <0,1,1,0>,
; which only selects lanes of the first operand. Renaming the test or
; correcting the mask (and regenerating the assertions) would make the two
; agree.
define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v2f16_0122:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
  %vec1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
  %result = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
  ret <4 x half> %result
}

; --- <6 x half> source and result ---
; With six-element sources, mask indices 0-5 refer to the first operand and
; 6-11 to the second.

define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v6f16_452367:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, v3
; GFX9-NEXT:    v_mov_b32_e32 v3, v2
; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-NEXT:    global_load_dword v3, v[3:4], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
  %vec1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
  %result = shufflevector <6 x half> %vec0, <6 x half> %vec1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
  ret <6 x half> %result
}

define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C) {
; GFX9-LABEL: fma_shuffle:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v6, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
; GFX9-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
; GFX9-NEXT:    s_endpgm
entry:
  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp12 = zext i32 %tmp1 to i64
  %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12
  %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8
  %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12
  %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8
  %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12
  %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8
  %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
  %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
  %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
  %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19)
  %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1>
  %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3>
  %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20)
  %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2>
  %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3>
  %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27)
  %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
  %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
  %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  store <4 x half> %tmp32, <4 x half>
addrspace(1)* %arrayidx2, align 8 774 ret void 775} 776 777define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { 778; GFX9-LABEL: shuffle_v4f16_0456: 779; GFX9: ; %bb.0: 780; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 781; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 782; GFX9-NEXT: s_waitcnt vmcnt(0) 783; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off 784; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 785; GFX9-NEXT: v_and_b32_e32 v0, v3, v0 786; GFX9-NEXT: s_waitcnt vmcnt(0) 787; GFX9-NEXT: v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 788; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 789; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v3 790; GFX9-NEXT: s_setpc_b64 s[30:31] 791 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 792 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 793 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 794 ret <4 x half> %shuffle 795} 796 797define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out) { 798; GFX9-LABEL: shuffle_scalar_load_v8i32_0123: 799; GFX9: ; %bb.0: 800; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 801; GFX9-NEXT: v_mov_b32_e32 v4, 0 802; GFX9-NEXT: s_waitcnt lgkmcnt(0) 803; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 804; GFX9-NEXT: s_waitcnt lgkmcnt(0) 805; GFX9-NEXT: v_mov_b32_e32 v0, s4 806; GFX9-NEXT: v_mov_b32_e32 v1, s5 807; GFX9-NEXT: v_mov_b32_e32 v2, s6 808; GFX9-NEXT: v_mov_b32_e32 v3, s7 809; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 810; GFX9-NEXT: s_endpgm 811 %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16 812 %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 813 store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8 814 ret void 815} 816 817declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 
818declare i32 @llvm.amdgcn.workitem.id.x() #0 819 820attributes #0 = { nounwind readnone speculatable } 821