1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3
; Mask <2,3,undef,undef>: result lanes 0-1 are elements 2-3 of %val0; lanes 2-3 are undef.
; The checks expect a single global_load_dword (offset:4) and no register shuffling.
4define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
5; GFX9-LABEL: shuffle_v4f16_23uu:
6; GFX9:       ; %bb.0:
7; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
9; GFX9-NEXT:    s_waitcnt vmcnt(0)
10; GFX9-NEXT:    s_setpc_b64 s[30:31]
11  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
12  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
13  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
14  ret <4 x half> %shuffle
15}
16
; Mask <2,3,4,undef>: lanes 0-1 are %val0[2..3], lane 2 is %val1[0], lane 3 is undef.
; The checks expect a dwordx2 load, a dword load (offset:4) and one v_mov_b32 into v1.
17define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
18; GFX9-LABEL: shuffle_v4f16_234u:
19; GFX9:       ; %bb.0:
20; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
22; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
23; GFX9-NEXT:    s_waitcnt vmcnt(1)
24; GFX9-NEXT:    v_mov_b32_e32 v1, v2
25; GFX9-NEXT:    s_waitcnt vmcnt(0)
26; GFX9-NEXT:    s_setpc_b64 s[30:31]
27  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
28  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
29  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
30  ret <4 x half> %shuffle
31}
32
; Mask <undef,1,undef,3>: only the odd lanes are defined, and each keeps its own position in %val0.
; The checks expect a plain dwordx2 load with no extra instructions before the return.
33define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
34; GFX9-LABEL: shuffle_v4f16_u1u3:
35; GFX9:       ; %bb.0:
36; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
38; GFX9-NEXT:    s_waitcnt vmcnt(0)
39; GFX9-NEXT:    s_setpc_b64 s[30:31]
40  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
41  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
42  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
43  ret <4 x half> %shuffle
44}
45
; Mask <undef,3,undef,1>: lane 1 is %val0[3] and lane 3 is %val0[1]; the even lanes are undef.
; The checks expect a dwordx2 load into v[1:2] followed by v_mov_b32 v0, v2 (the two dwords swapped).
46define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
47; GFX9-LABEL: shuffle_v4f16_u3u1:
48; GFX9:       ; %bb.0:
49; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
51; GFX9-NEXT:    s_waitcnt vmcnt(0)
52; GFX9-NEXT:    v_mov_b32_e32 v0, v2
53; GFX9-NEXT:    s_setpc_b64 s[30:31]
54  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
55  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
56  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
57  ret <4 x half> %shuffle
58}
59
; Mask <undef,3,undef,undef>: only lane 1 is defined (%val0[3]).
; The checks expect a single global_load_dword at offset:4 and nothing else.
60define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
61; GFX9-LABEL: shuffle_v4f16_u3uu:
62; GFX9:       ; %bb.0:
63; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
65; GFX9-NEXT:    s_waitcnt vmcnt(0)
66; GFX9-NEXT:    s_setpc_b64 s[30:31]
67  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
68  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
69  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
70  ret <4 x half> %shuffle
71}
72
; Mask <3,undef,6,undef>: lane 0 is %val0[3], lane 2 is %val1[2]; the odd lanes are undef.
; The checks expect two dword loads at offset:4 and a 16-bit right shift of v0.
73define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
74; GFX9-LABEL: shuffle_v4f16_3u6u:
75; GFX9:       ; %bb.0:
76; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
77; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
78; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
79; GFX9-NEXT:    s_waitcnt vmcnt(1)
80; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
81; GFX9-NEXT:    s_waitcnt vmcnt(0)
82; GFX9-NEXT:    s_setpc_b64 s[30:31]
83  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
84  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
85  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
86  ret <4 x half> %shuffle
87}
88
; Mask <3,undef,undef,7>: lane 0 is %val0[3], lane 3 is %val1[3]; the middle lanes are undef.
; The checks expect two dword loads at offset:4 and a 16-bit right shift of v0.
89define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
90; GFX9-LABEL: shuffle_v4f16_3uu7:
91; GFX9:       ; %bb.0:
92; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
94; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
95; GFX9-NEXT:    s_waitcnt vmcnt(1)
96; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
97; GFX9-NEXT:    s_waitcnt vmcnt(0)
98; GFX9-NEXT:    s_setpc_b64 s[30:31]
99  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
100  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
101  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
102  ret <4 x half> %shuffle
103}
104
; Mask <3,5,undef,5>: lane 0 is %val0[3], lanes 1 and 3 are both %val1[1], lane 2 is undef.
; The checks expect two dword loads, a v_lshrrev / SDWA v_and / v_lshl_or sequence building v0,
; and v1 as a plain copy of the loaded v2.
105define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
106; GFX9-LABEL: shuffle_v4f16_35u5:
107; GFX9:       ; %bb.0:
108; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109; GFX9-NEXT:    global_load_dword v2, v[2:3], off
110; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
111; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
112; GFX9-NEXT:    s_waitcnt vmcnt(1)
113; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
114; GFX9-NEXT:    s_waitcnt vmcnt(0)
115; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
116; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
117; GFX9-NEXT:    v_mov_b32_e32 v1, v2
118; GFX9-NEXT:    s_setpc_b64 s[30:31]
119  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
120  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
121  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
122  ret <4 x half> %shuffle
123}
124
; Mask <3,5,7,undef>: lanes 0-2 are %val0[3], %val1[1], %val1[3]; lane 3 is undef.
; The checks expect a dwordx2 load and a dword load (offset:4), v0 packed with an SDWA v_and
; plus v_lshl_or, and v1 produced by shifting v3 right by 16.
125define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
126; GFX9-LABEL: shuffle_v4f16_357u:
127; GFX9:       ; %bb.0:
128; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
130; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
131; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
132; GFX9-NEXT:    s_waitcnt vmcnt(1)
133; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
134; GFX9-NEXT:    s_waitcnt vmcnt(0)
135; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
136; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
137; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
138; GFX9-NEXT:    s_setpc_b64 s[30:31]
139  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
140  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
141  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
142  ret <4 x half> %shuffle
143}
144
; Mask <0,1,0,1>: the low element pair of %val0 is duplicated into both halves of the result.
; The checks expect one dword load followed by v_mov_b32 v1, v0.
145define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
146; GFX9-LABEL: shuffle_v4f16_0101:
147; GFX9:       ; %bb.0:
148; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149; GFX9-NEXT:    global_load_dword v0, v[0:1], off
150; GFX9-NEXT:    s_waitcnt vmcnt(0)
151; GFX9-NEXT:    v_mov_b32_e32 v1, v0
152; GFX9-NEXT:    s_setpc_b64 s[30:31]
153  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
154  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
155  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
156  ret <4 x half> %shuffle
157}
158
; Mask <0,1,2,3>: identity shuffle of %val0.
; The checks expect a single dwordx2 load and no ALU instructions.
159define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
160; GFX9-LABEL: shuffle_v4f16_0123:
161; GFX9:       ; %bb.0:
162; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
163; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
164; GFX9-NEXT:    s_waitcnt vmcnt(0)
165; GFX9-NEXT:    s_setpc_b64 s[30:31]
166  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
167  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
168  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
169  ret <4 x half> %shuffle
170}
171
; Mask <0,1,4,5>: lanes 0-1 are %val0[0..1], lanes 2-3 are %val1[0..1].
; The checks expect two dword loads without offsets and no ALU instructions.
172define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
173; GFX9-LABEL: shuffle_v4f16_0145:
174; GFX9:       ; %bb.0:
175; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
176; GFX9-NEXT:    global_load_dword v0, v[0:1], off
177; GFX9-NEXT:    global_load_dword v1, v[2:3], off
178; GFX9-NEXT:    s_waitcnt vmcnt(0)
179; GFX9-NEXT:    s_setpc_b64 s[30:31]
180  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
181  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
182  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
183  ret <4 x half> %shuffle
184}
185
; Mask <0,1,6,7>: lanes 0-1 are %val0[0..1], lanes 2-3 are %val1[2..3].
; The checks expect one dword load without offset and one with offset:4, and no ALU instructions.
186define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
187; GFX9-LABEL: shuffle_v4f16_0167:
188; GFX9:       ; %bb.0:
189; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190; GFX9-NEXT:    global_load_dword v0, v[0:1], off
191; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
192; GFX9-NEXT:    s_waitcnt vmcnt(0)
193; GFX9-NEXT:    s_setpc_b64 s[30:31]
194  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
195  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
196  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
197  ret <4 x half> %shuffle
198}
199
; Mask <2,3,0,1>: the two element pairs of %val0 are swapped.
; The checks expect a dwordx2 load into v[1:2] followed by v_mov_b32 v0, v2.
200define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
201; GFX9-LABEL: shuffle_v4f16_2301:
202; GFX9:       ; %bb.0:
203; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
205; GFX9-NEXT:    s_waitcnt vmcnt(0)
206; GFX9-NEXT:    v_mov_b32_e32 v0, v2
207; GFX9-NEXT:    s_setpc_b64 s[30:31]
208  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
209  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
210  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
211  ret <4 x half> %shuffle
212}
213
; Mask <2,3,2,3>: the high element pair of %val0 is duplicated into both halves of the result.
; The checks expect one dword load at offset:4 followed by v_mov_b32 v1, v0.
214define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
215; GFX9-LABEL: shuffle_v4f16_2323:
216; GFX9:       ; %bb.0:
217; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
219; GFX9-NEXT:    s_waitcnt vmcnt(0)
220; GFX9-NEXT:    v_mov_b32_e32 v1, v0
221; GFX9-NEXT:    s_setpc_b64 s[30:31]
222  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
223  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
224  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
225  ret <4 x half> %shuffle
226}
227
; Mask <2,3,4,5>: lanes 0-1 are %val0[2..3], lanes 2-3 are %val1[0..1].
; The checks expect one dword load at offset:4 and one without offset, and no ALU instructions.
228define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
229; GFX9-LABEL: shuffle_v4f16_2345:
230; GFX9:       ; %bb.0:
231; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
232; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
233; GFX9-NEXT:    global_load_dword v1, v[2:3], off
234; GFX9-NEXT:    s_waitcnt vmcnt(0)
235; GFX9-NEXT:    s_setpc_b64 s[30:31]
236  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
237  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
238  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
239  ret <4 x half> %shuffle
240}
241
; Mask <2,3,6,7>: lanes 0-1 are %val0[2..3], lanes 2-3 are %val1[2..3].
; The checks expect two dword loads, both at offset:4, and no ALU instructions.
242define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
243; GFX9-LABEL: shuffle_v4f16_2367:
244; GFX9:       ; %bb.0:
245; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
246; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
247; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
248; GFX9-NEXT:    s_waitcnt vmcnt(0)
249; GFX9-NEXT:    s_setpc_b64 s[30:31]
250  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
251  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
252  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
253  ret <4 x half> %shuffle
254}
255
; Mask <4,5,0,1>: lanes 0-1 are %val1[0..1], lanes 2-3 are %val0[0..1].
; The checks expect two dword loads without offsets and a v_mov_b32 v0, v2.
256define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
257; GFX9-LABEL: shuffle_v4f16_4501:
258; GFX9:       ; %bb.0:
259; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
260; GFX9-NEXT:    global_load_dword v2, v[2:3], off
261; GFX9-NEXT:    global_load_dword v1, v[0:1], off
262; GFX9-NEXT:    s_waitcnt vmcnt(1)
263; GFX9-NEXT:    v_mov_b32_e32 v0, v2
264; GFX9-NEXT:    s_waitcnt vmcnt(0)
265; GFX9-NEXT:    s_setpc_b64 s[30:31]
266  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
267  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
268  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
269  ret <4 x half> %shuffle
270}
271
; Mask <4,5,2,3>: lanes 0-1 are %val1[0..1], lanes 2-3 are %val0[2..3].
; The checks expect a dword load without offset, a dword load at offset:4 and a v_mov_b32 v0, v2.
272define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
273; GFX9-LABEL: shuffle_v4f16_4523:
274; GFX9:       ; %bb.0:
275; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276; GFX9-NEXT:    global_load_dword v2, v[2:3], off
277; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
278; GFX9-NEXT:    s_waitcnt vmcnt(1)
279; GFX9-NEXT:    v_mov_b32_e32 v0, v2
280; GFX9-NEXT:    s_waitcnt vmcnt(0)
281; GFX9-NEXT:    s_setpc_b64 s[30:31]
282  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
283  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
284  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
285  ret <4 x half> %shuffle
286}
287
; Mask <4,5,4,5>: the low element pair of %val1 is duplicated into both halves of the result.
; The checks expect one dword load addressed through v[2:3] followed by v_mov_b32 v1, v0.
288define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
289; GFX9-LABEL: shuffle_v4f16_4545:
290; GFX9:       ; %bb.0:
291; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292; GFX9-NEXT:    global_load_dword v0, v[2:3], off
293; GFX9-NEXT:    s_waitcnt vmcnt(0)
294; GFX9-NEXT:    v_mov_b32_e32 v1, v0
295; GFX9-NEXT:    s_setpc_b64 s[30:31]
296  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
297  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
298  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
299  ret <4 x half> %shuffle
300}
301
; Mask <4,5,6,7>: the result is exactly %val1.
; The checks expect a single dwordx2 load addressed through v[2:3] and no ALU instructions.
302define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
303; GFX9-LABEL: shuffle_v4f16_4567:
304; GFX9:       ; %bb.0:
305; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
306; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
307; GFX9-NEXT:    s_waitcnt vmcnt(0)
308; GFX9-NEXT:    s_setpc_b64 s[30:31]
309  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
310  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
311  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
312  ret <4 x half> %shuffle
313}
314
; Mask <6,7,0,1>: lanes 0-1 are %val1[2..3], lanes 2-3 are %val0[0..1].
; The checks expect a dword load at offset:4, a dword load without offset and a v_mov_b32 v0, v2.
315define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
316; GFX9-LABEL: shuffle_v4f16_6701:
317; GFX9:       ; %bb.0:
318; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
319; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
320; GFX9-NEXT:    global_load_dword v1, v[0:1], off
321; GFX9-NEXT:    s_waitcnt vmcnt(1)
322; GFX9-NEXT:    v_mov_b32_e32 v0, v2
323; GFX9-NEXT:    s_waitcnt vmcnt(0)
324; GFX9-NEXT:    s_setpc_b64 s[30:31]
325  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
326  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
327  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
328  ret <4 x half> %shuffle
329}
330
; Mask <6,7,2,3>: lanes 0-1 are %val1[2..3], lanes 2-3 are %val0[2..3].
; The checks expect two dword loads at offset:4 and a v_mov_b32 v0, v2.
331define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
332; GFX9-LABEL: shuffle_v4f16_6723:
333; GFX9:       ; %bb.0:
334; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
335; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
336; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
337; GFX9-NEXT:    s_waitcnt vmcnt(1)
338; GFX9-NEXT:    v_mov_b32_e32 v0, v2
339; GFX9-NEXT:    s_waitcnt vmcnt(0)
340; GFX9-NEXT:    s_setpc_b64 s[30:31]
341  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
342  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
343  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
344  ret <4 x half> %shuffle
345}
346
; Mask <6,7,4,5>: the two element pairs of %val1 are swapped.
; The checks expect a dwordx2 load into v[1:2] addressed through v[2:3], then v_mov_b32 v0, v2.
347define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
348; GFX9-LABEL: shuffle_v4f16_6745:
349; GFX9:       ; %bb.0:
350; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
351; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
352; GFX9-NEXT:    s_waitcnt vmcnt(0)
353; GFX9-NEXT:    v_mov_b32_e32 v0, v2
354; GFX9-NEXT:    s_setpc_b64 s[30:31]
355  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
356  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
357  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
358  ret <4 x half> %shuffle
359}
360
; Mask <6,7,6,7>: the high element pair of %val1 is duplicated into both halves of the result.
; The checks expect one dword load at offset:4 addressed through v[2:3], then v_mov_b32 v1, v0.
361define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
362; GFX9-LABEL: shuffle_v4f16_6767:
363; GFX9:       ; %bb.0:
364; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
365; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
366; GFX9-NEXT:    s_waitcnt vmcnt(0)
367; GFX9-NEXT:    v_mov_b32_e32 v1, v0
368; GFX9-NEXT:    s_setpc_b64 s[30:31]
369  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
370  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
371  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
372  ret <4 x half> %shuffle
373}
374
; Mask <2,3,5,6>: lanes 0-1 are %val0[2..3]; lanes 2-3 are %val1[1..2], a pair that does not
; start on an even element. The checks expect v1 to be built with an SDWA v_and (WORD_1 of v2)
; and a v_lshl_or_b32 with v3, while v0 comes straight from a dword load at offset:4.
375define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
376; GFX9-LABEL: shuffle_v4f16_2356:
377; GFX9:       ; %bb.0:
378; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
379; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
380; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
381; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
382; GFX9-NEXT:    s_waitcnt vmcnt(1)
383; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
384; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
385; GFX9-NEXT:    s_waitcnt vmcnt(0)
386; GFX9-NEXT:    s_setpc_b64 s[30:31]
387  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
388  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
389  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
390  ret <4 x half> %shuffle
391}
392
; Mask <5,6,2,3>: lanes 0-1 are %val1[1..2], a pair that does not start on an even element;
; lanes 2-3 are %val0[2..3]. The checks expect v0 to be built with an SDWA v_and (WORD_1 of v2)
; and a v_lshl_or_b32 with v3, while v1 comes straight from a dword load at offset:4.
393define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
394; GFX9-LABEL: shuffle_v4f16_5623:
395; GFX9:       ; %bb.0:
396; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
397; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
398; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
399; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
400; GFX9-NEXT:    s_waitcnt vmcnt(1)
401; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
402; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
403; GFX9-NEXT:    s_waitcnt vmcnt(0)
404; GFX9-NEXT:    s_setpc_b64 s[30:31]
405  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
406  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
407  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
408  ret <4 x half> %shuffle
409}
410
; Mask <3,4,5,6>: lanes are %val0[3], %val1[0], %val1[1], %val1[2], so neither result pair
; starts on an even source element. The checks expect both v0 and v1 to be assembled with an
; SDWA v_and (WORD_1) followed by v_lshl_or_b32.
411define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
412; GFX9-LABEL: shuffle_v4f16_3456:
413; GFX9:       ; %bb.0:
414; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
415; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
416; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
417; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
418; GFX9-NEXT:    s_waitcnt vmcnt(0)
419; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
420; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
421; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
422; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
423; GFX9-NEXT:    s_setpc_b64 s[30:31]
424  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
425  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
426  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
427  ret <4 x half> %shuffle
428}
429
; Mask <5,6,3,4>: lanes are %val1[1], %val1[2], %val0[3], %val1[0].
; The checks expect two SDWA v_and instructions (WORD_1 extraction, one result parked in v4)
; and two v_lshl_or_b32 instructions producing v1 and then v0.
430define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
431; GFX9-LABEL: shuffle_v4f16_5634:
432; GFX9:       ; %bb.0:
433; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
435; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
436; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
437; GFX9-NEXT:    s_waitcnt vmcnt(1)
438; GFX9-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
439; GFX9-NEXT:    s_waitcnt vmcnt(0)
440; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
441; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v0
442; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v4
443; GFX9-NEXT:    s_setpc_b64 s[30:31]
444  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
445  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
446  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
447  ret <4 x half> %shuffle
448}
449
; Mask <5,7,3,4>: lanes are %val1[1], %val1[3], %val0[3], %val1[0].
; The checks expect a 16-bit right shift of v3, two SDWA v_and instructions (WORD_1 extraction)
; and two v_lshl_or_b32 instructions producing v1 and then v0.
450define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
451; GFX9-LABEL: shuffle_v4f16_5734:
452; GFX9:       ; %bb.0:
453; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
454; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
455; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
456; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
457; GFX9-NEXT:    s_waitcnt vmcnt(1)
458; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
459; GFX9-NEXT:    s_waitcnt vmcnt(0)
460; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
461; GFX9-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
462; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v0
463; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v4
464; GFX9-NEXT:    s_setpc_b64 s[30:31]
465  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
466  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
467  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
468  ret <4 x half> %shuffle
469}
470
; Integer (<4 x i16>) variant of mask <2,3,5,6>: lanes 0-1 are %val0[2..3], lanes 2-3 are %val1[1..2].
; The checks expect v1 to be built with an SDWA v_and (WORD_1 of v2) and a v_lshl_or_b32 with v3.
471define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
472; GFX9-LABEL: shuffle_v4i16_2356:
473; GFX9:       ; %bb.0:
474; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
475; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
476; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
477; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
478; GFX9-NEXT:    s_waitcnt vmcnt(1)
479; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
480; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
481; GFX9-NEXT:    s_waitcnt vmcnt(0)
482; GFX9-NEXT:    s_setpc_b64 s[30:31]
483  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
484  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
485  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
486  ret <4 x i16> %shuffle
487}
488
; Integer (<4 x i16>) variant of mask <0,1,6,7>: lanes 0-1 are %val0[0..1], lanes 2-3 are %val1[2..3].
; The checks expect one dword load without offset and one at offset:4, and no ALU instructions.
489define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
490; GFX9-LABEL: shuffle_v4i16_0167:
491; GFX9:       ; %bb.0:
492; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
493; GFX9-NEXT:    global_load_dword v0, v[0:1], off
494; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
495; GFX9-NEXT:    s_waitcnt vmcnt(0)
496; GFX9-NEXT:    s_setpc_b64 s[30:31]
497  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
498  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
499  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
500  ret <4 x i16> %shuffle
501}
502
; zeroinitializer mask (<0,0,0,0>): element 0 of %val0 is splatted into all four lanes.
; The checks expect a dwordx2 load, a v_and with 0xffff plus v_lshl_or_b32 to duplicate the low
; half within v0, and a v_mov_b32 copying v0 to v1.
503define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
504; GFX9-LABEL: shuffle_v4f16_0000:
505; GFX9:       ; %bb.0:
506; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
507; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
508; GFX9-NEXT:    s_waitcnt vmcnt(0)
509; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
510; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
511; GFX9-NEXT:    v_mov_b32_e32 v1, v0
512; GFX9-NEXT:    s_setpc_b64 s[30:31]
513  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
514  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
515  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
516  ret <4 x half> %shuffle
517}
518
; Mask <1,0,1,0>: the first element pair of %val0 with its halves swapped, repeated twice.
; The checks expect a dwordx2 load, an SDWA v_and (WORD_1 of v0) plus v_lshl_or_b32 to swap
; the halves, and a v_mov_b32 copying v0 to v1.
519define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
520; GFX9-LABEL: shuffle_v4f16_1010:
521; GFX9:       ; %bb.0:
522; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
523; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
524; GFX9-NEXT:    s_waitcnt vmcnt(0)
525; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
526; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
527; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
528; GFX9-NEXT:    v_mov_b32_e32 v1, v0
529; GFX9-NEXT:    s_setpc_b64 s[30:31]
530  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
531  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
532  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
533  ret <4 x half> %shuffle
534}
535
; Mask <1,1,0,0>: lanes 0-1 are both %val0[1], lanes 2-3 are both %val0[0].
; The checks expect a dwordx2 load, then a v_and / v_lshl_or_b32 pair for v1 and a
; v_lshrrev / v_and / v_lshl_or_b32 sequence for v0.
536define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
537; GFX9-LABEL: shuffle_v4f16_1100:
538; GFX9:       ; %bb.0:
539; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
540; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
541; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
542; GFX9-NEXT:    s_waitcnt vmcnt(0)
543; GFX9-NEXT:    v_and_b32_e32 v1, v2, v0
544; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
545; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
546; GFX9-NEXT:    v_and_b32_e32 v0, v2, v3
547; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
548; GFX9-NEXT:    s_setpc_b64 s[30:31]
549  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
550  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
551  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
552  ret <4 x half> %shuffle
553}
554
; Mask <6,1,6,1>: the pair (%val1[2], %val0[1]) repeated twice.
; The checks expect two dword loads, a v_and with 0xffff on v2, a 16-bit right shift of v0,
; a v_lshl_or_b32 combining them into v0, and a v_mov_b32 copying v0 to v1.
555define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
556; GFX9-LABEL: shuffle_v4f16_6161:
557; GFX9:       ; %bb.0:
558; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
559; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
560; GFX9-NEXT:    global_load_dword v0, v[0:1], off
561; GFX9-NEXT:    s_waitcnt vmcnt(1)
562; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
563; GFX9-NEXT:    s_waitcnt vmcnt(0)
564; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
565; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
566; GFX9-NEXT:    v_mov_b32_e32 v1, v0
567; GFX9-NEXT:    s_setpc_b64 s[30:31]
568  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
569  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
570  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
571  ret <4 x half> %shuffle
572}
573
; Mask <2,3,3,3>: lanes 0-1 are %val0[2..3]; lanes 2-3 are both %val0[3].
; The checks expect one dword load at offset:4 giving v0, then a v_lshrrev / v_and /
; v_lshl_or_b32 sequence that builds v1 from the high half of v0.
574define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
575; GFX9-LABEL: shuffle_v4f16_2333:
576; GFX9:       ; %bb.0:
577; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
578; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
579; GFX9-NEXT:    s_waitcnt vmcnt(0)
580; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
581; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
582; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
583; GFX9-NEXT:    s_setpc_b64 s[30:31]
584  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
585  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
586  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
587  ret <4 x half> %shuffle
588}
589
; NOTE(review): the function name says 6667, but the mask below is <2,3,3,3> -- the same mask
; used by shuffle_v4f16_2333 -- so the <6,6,6,7> case (all lanes taken from %val1) is not
; actually exercised here. Correcting it means changing the mask to
; <i32 6, i32 6, i32 6, i32 7> and regenerating the GFX9 checks with
; utils/update_llc_test_checks.py; the checks below match the mask as currently written.
; TODO confirm the intended mask.
590define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
591; GFX9-LABEL: shuffle_v4f16_6667:
592; GFX9:       ; %bb.0:
593; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
594; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
595; GFX9-NEXT:    s_waitcnt vmcnt(0)
596; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
597; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
598; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
599; GFX9-NEXT:    s_setpc_b64 s[30:31]
600  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
601  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
602  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
603  ret <4 x half> %shuffle
604}
605
; <8 x half> sources narrowed to a <4 x half> result. Mask <0,1,0,1>: the first element pair
; of %val0 is duplicated. The checks expect one dword load followed by v_mov_b32 v1, v0.
606define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
607; GFX9-LABEL: shuffle_v8f16_0101:
608; GFX9:       ; %bb.0:
609; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
610; GFX9-NEXT:    global_load_dword v0, v[0:1], off
611; GFX9-NEXT:    s_waitcnt vmcnt(0)
612; GFX9-NEXT:    v_mov_b32_e32 v1, v0
613; GFX9-NEXT:    s_setpc_b64 s[30:31]
614  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
615  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
616  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
617  ret <4 x half> %shuffle
618}
619
; <8 x half> sources narrowed to a <4 x half> result. Mask <0,1,2,3>: the first four elements
; of %val0. The checks expect a single dwordx2 load and no ALU instructions.
620define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
621; GFX9-LABEL: shuffle_v8f16_0123:
622; GFX9:       ; %bb.0:
623; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
624; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
625; GFX9-NEXT:    s_waitcnt vmcnt(0)
626; GFX9-NEXT:    s_setpc_b64 s[30:31]
627  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
628  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
629  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
630  ret <4 x half> %shuffle
631}
632
; <8 x half> sources: indices 0-7 select from %val0 and 8-15 from %val1. Mask <4,5,8,9>:
; lanes 0-1 are %val0[4..5], lanes 2-3 are %val1[0..1].
; The checks expect one dword load at offset:8 and one without offset, and no ALU instructions.
633define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
634; GFX9-LABEL: shuffle_v8f16_4589:
635; GFX9:       ; %bb.0:
636; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
637; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:8
638; GFX9-NEXT:    global_load_dword v1, v[2:3], off
639; GFX9-NEXT:    s_waitcnt vmcnt(0)
640; GFX9-NEXT:    s_setpc_b64 s[30:31]
641  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
642  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
643  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
644  ret <4 x half> %shuffle
645}
646
; <8 x half> sources: indices 0-7 select from %val0 and 8-15 from %val1. Mask <10,11,2,3>:
; lanes 0-1 are %val1[2..3], lanes 2-3 are %val0[2..3].
; The checks expect two dword loads at offset:4 and a v_mov_b32 v0, v2.
647define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
648; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
649; GFX9:       ; %bb.0:
650; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
651; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
652; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
653; GFX9-NEXT:    s_waitcnt vmcnt(1)
654; GFX9-NEXT:    v_mov_b32_e32 v0, v2
655; GFX9-NEXT:    s_waitcnt vmcnt(0)
656; GFX9-NEXT:    s_setpc_b64 s[30:31]
657  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
658  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
659  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
660  ret <4 x half> %shuffle
661}
662
; Two-source narrowing shuffle with a misaligned pair: mask <13,14,2,3> takes
; halves 5-6 of %val1, which straddle a dword boundary, then halves 2-3 of
; %val0, which are dword aligned.
; Expected code: a dwordx2 load at byte offset 8 whose halves are repacked
; with an SDWA and plus a shift-or, and a plain dword load at offset 4.
define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:8
; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
  ret <4 x half> %shuffle
}
680
; Widening shuffle from odd-sized vectors: mask <0,1,2,2> copies the three
; halves of %val0 and repeats the last one. Indices 3..5 would address %val1,
; which the mask never uses.
; Expected code: one dwordx2 load, then the low half of v1 is duplicated into
; both halves of v1.
define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v3f16_0122:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
  %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x half> %shuffle
}
695
; Widening shuffle from <2 x half>: the mask is <0,1,1,0>, so the result is
; %val0 followed by %val0 with its two halves swapped. %val1 is unused.
; NOTE(review): the function name says "0122" but the mask below is 0,1,1,0.
; The expected code matches the mask (high half of v0 goes to the low half of
; v1, low half of v0 to the high half), so only the name is misleading.
; It is left unchanged here so that the symbol and label stay stable.
define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v2f16_0122:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
  %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
  ret <4 x half> %shuffle
}
711
; Same-width shuffle on a three-dword vector: mask <4,5,2,3,6,7> produces
; halves 4-5 and 2-3 of %val0 followed by halves 0-1 of %val1 (indices 6 and
; up address the second operand). Every selected pair is dword aligned.
; Expected code: a dwordx3 load and a dword load, with register copies only
; (no half-word repacking).
define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v6f16_452367:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, v3
; GFX9-NEXT:    v_mov_b32_e32 v3, v2
; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-NEXT:    global_load_dword v3, v[3:4], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
  %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
  ret <6 x half> %shuffle
}
730
; Kernel mixing shufflevector with packed FMA. For the work-item's x index i
; it loads A[i], B[i] and C[i] (each <4 x half>) and computes
;   lo = splat(A[1]) * B[2:3] + (splat(A[0]) * B[0:1] + C[0:1])
;   hi = splat(A[3]) * B[2:3] + (splat(A[2]) * B[0:1] + C[2:3])
; then stores <lo, hi> back to C[i].
; The expected code has exactly four v_pk_fma_f16 instructions between the
; loads and the store; the splats and sub-vector extracts are expressed via
; the op_sel / op_sel_hi modifiers and no separate shuffle instruction
; appears.
; NOTE(review): the second kernarg load is checked as a 4-dword load at
; offset 0x10, yet only s[4:5] of its result is used afterwards. Confirm this
; against freshly regenerated assertions rather than editing it by hand.
define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C)  {
; GFX9-LABEL: fma_shuffle:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v6, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
; GFX9-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
; GFX9-NEXT:    s_endpgm
entry:
  ; Per-work-item element addresses in A, B and C.
  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp12 = zext i32 %tmp1 to i64
  %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12
  %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8
  %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12
  %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8
  %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12
  %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8
  ; Low result pair: splat(A[0]) * B[0:1] + C[0:1], then splat(A[1]) * B[2:3] added on top.
  %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
  %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
  %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
  %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19)
  %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1>
  %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3>
  %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20)
  ; Widen the low result and pair it with the untouched high half of C
  ; (mask indices 6 and 7 select elements 2 and 3 of %tmp16).
  %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ; High result pair: splat(A[2]) * B[0:1] + C[2:3], then splat(A[3]) * B[2:3] added on top.
  %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2>
  %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3>
  %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27)
  %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
  %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
  ; Concatenate the low and high result pairs and write them back to C[i].
  %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8
  ret void
}
776
; Two-source shuffle with no dword-aligned pair: mask <0,4,5,6> takes half 0
; of %val0 followed by halves 0, 1 and 2 of %val1 (indices 4 and up address
; the second operand).
; Expected code: two dwordx2 loads, then each result dword is assembled from
; two different source halves with and / SDWA and / shift-or.
define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0456:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT:    v_and_b32_e32 v0, v3, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
  ret <4 x half> %shuffle
}
796
; Kernel that loads an <8 x i32> from address space 4, keeps only lanes 0-3
; via mask <0,1,2,3>, and stores the resulting <4 x i32> to %out.
; Expected code: the data is fetched with a 4-dword scalar load rather than
; an 8-dword one, copied into v0-v3, and written with a single dwordx4 store.
define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out)  {
; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    v_mov_b32_e32 v3, s7
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT:    s_endpgm
  %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16
  ; Only the low half of the loaded vector is live after this shuffle.
  %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8
  ret void
}
816
; Intrinsics called by @fma_shuffle: the <2 x half> fused multiply-add and
; the work-item x index used to address the A, B and C arrays.
declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0

; Attribute group shared by both intrinsic declarations above.
attributes #0 = { nounwind readnone speculatable }
821