1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
8
; udot8_acc32: unsigned 8-lane i4 dot product accumulated into an i32.
; Loads one <8 x i4> vector from each of %src1 and %src2, zero-extends every
; lane to i32, multiplies matching lanes, and adds the eight products to the
; i32 loaded from %dst before storing the sum back to %dst.
; Per the checks below, the gfx906, gfx1011 and gfx1012 runs emit a single
; v_dot8_u32_u4, while the gfx700, gfx803 and gfx900 runs emit a chain of
; eight v_mad_u32_u24 instructions.
9define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
10; GFX7-LABEL: udot8_acc32:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
13; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
14; GFX7-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
15; GFX7-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
16; GFX7-NEXT:    s_mov_b32 s26, -1
17; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
19; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
20; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
21; GFX7-NEXT:    s_mov_b32 s27, 0xe8f000
22; GFX7-NEXT:    s_add_u32 s24, s24, s3
23; GFX7-NEXT:    s_addc_u32 s25, s25, 0
24; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
25; GFX7-NEXT:    s_lshr_b32 s7, s6, 28
26; GFX7-NEXT:    s_bfe_u32 s14, s6, 0x40018
27; GFX7-NEXT:    s_bfe_u32 s15, s6, 0x40014
28; GFX7-NEXT:    s_bfe_u32 s16, s6, 0x40010
29; GFX7-NEXT:    s_bfe_u32 s17, s6, 0x4000c
30; GFX7-NEXT:    s_bfe_u32 s18, s6, 0x40008
31; GFX7-NEXT:    s_bfe_u32 s19, s6, 0x40004
32; GFX7-NEXT:    s_and_b32 s6, s6, 15
33; GFX7-NEXT:    s_lshr_b32 s5, s4, 28
34; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40018
35; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40014
36; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x40010
37; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x4000c
38; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40008
39; GFX7-NEXT:    s_bfe_u32 s13, s4, 0x40004
40; GFX7-NEXT:    s_and_b32 s4, s4, 15
41; GFX7-NEXT:    v_mov_b32_e32 v0, s6
42; GFX7-NEXT:    v_mov_b32_e32 v1, s20
43; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v0, v1
44; GFX7-NEXT:    v_mov_b32_e32 v1, s19
45; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
46; GFX7-NEXT:    v_mov_b32_e32 v1, s18
47; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
48; GFX7-NEXT:    v_mov_b32_e32 v1, s17
49; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
50; GFX7-NEXT:    v_mov_b32_e32 v1, s16
51; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
52; GFX7-NEXT:    v_mov_b32_e32 v1, s15
53; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
54; GFX7-NEXT:    v_mov_b32_e32 v1, s14
55; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
56; GFX7-NEXT:    v_mov_b32_e32 v1, s7
57; GFX7-NEXT:    s_mov_b32 s3, 0xf000
58; GFX7-NEXT:    s_mov_b32 s2, -1
59; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
60; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
61; GFX7-NEXT:    s_endpgm
62;
63; GFX8-LABEL: udot8_acc32:
64; GFX8:       ; %bb.0: ; %entry
65; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
66; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
67; GFX8-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
68; GFX8-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
69; GFX8-NEXT:    s_mov_b32 s22, -1
70; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
72; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
73; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
74; GFX8-NEXT:    s_mov_b32 s23, 0xe80000
75; GFX8-NEXT:    s_add_u32 s20, s20, s3
76; GFX8-NEXT:    s_addc_u32 s21, s21, 0
77; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
78; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
79; GFX8-NEXT:    s_bfe_u32 s12, s6, 0x40018
80; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40014
81; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40010
82; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x4000c
83; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x40008
84; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40004
85; GFX8-NEXT:    s_and_b32 s6, s6, 15
86; GFX8-NEXT:    s_lshr_b32 s3, s2, 28
87; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40018
88; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40014
89; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
90; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x4000c
91; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40008
92; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40004
93; GFX8-NEXT:    s_and_b32 s2, s2, 15
94; GFX8-NEXT:    v_mov_b32_e32 v0, s6
95; GFX8-NEXT:    v_mov_b32_e32 v1, s18
96; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
97; GFX8-NEXT:    v_mov_b32_e32 v1, s17
98; GFX8-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
99; GFX8-NEXT:    v_mov_b32_e32 v1, s16
100; GFX8-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
101; GFX8-NEXT:    v_mov_b32_e32 v1, s15
102; GFX8-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
103; GFX8-NEXT:    v_mov_b32_e32 v1, s14
104; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
105; GFX8-NEXT:    v_mov_b32_e32 v1, s13
106; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
107; GFX8-NEXT:    v_mov_b32_e32 v1, s12
108; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
109; GFX8-NEXT:    v_mov_b32_e32 v1, s7
110; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
111; GFX8-NEXT:    v_mov_b32_e32 v0, s0
112; GFX8-NEXT:    v_mov_b32_e32 v1, s1
113; GFX8-NEXT:    flat_store_dword v[0:1], v2
114; GFX8-NEXT:    s_endpgm
115;
116; GFX9-LABEL: udot8_acc32:
117; GFX9:       ; %bb.0: ; %entry
118; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
119; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
120; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
121; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
122; GFX9-NEXT:    s_mov_b32 s22, -1
123; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
124; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
125; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
126; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
127; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
128; GFX9-NEXT:    s_add_u32 s20, s20, s3
129; GFX9-NEXT:    s_addc_u32 s21, s21, 0
130; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
131; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
132; GFX9-NEXT:    s_bfe_u32 s12, s6, 0x40018
133; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40014
134; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40010
135; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x4000c
136; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x40008
137; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40004
138; GFX9-NEXT:    s_and_b32 s6, s6, 15
139; GFX9-NEXT:    s_lshr_b32 s3, s2, 28
140; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40018
141; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40014
142; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
143; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x4000c
144; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
145; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40004
146; GFX9-NEXT:    s_and_b32 s2, s2, 15
147; GFX9-NEXT:    v_mov_b32_e32 v1, s6
148; GFX9-NEXT:    v_mov_b32_e32 v2, s18
149; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
150; GFX9-NEXT:    v_mov_b32_e32 v2, s17
151; GFX9-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
152; GFX9-NEXT:    v_mov_b32_e32 v2, s16
153; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
154; GFX9-NEXT:    v_mov_b32_e32 v2, s15
155; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
156; GFX9-NEXT:    v_mov_b32_e32 v2, s14
157; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
158; GFX9-NEXT:    v_mov_b32_e32 v2, s13
159; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
160; GFX9-NEXT:    v_mov_b32_e32 v2, s12
161; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
162; GFX9-NEXT:    v_mov_b32_e32 v2, s7
163; GFX9-NEXT:    v_mov_b32_e32 v0, 0
164; GFX9-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
165; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
166; GFX9-NEXT:    s_endpgm
167;
168; GFX9-DL-LABEL: udot8_acc32:
169; GFX9-DL:       ; %bb.0: ; %entry
170; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
171; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
172; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
173; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
174; GFX9-DL-NEXT:    s_mov_b32 s10, -1
175; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
176; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
177; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
178; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
179; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
180; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
181; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
182; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
183; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
185; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
186; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, s4, v1, v2
187; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
188; GFX9-DL-NEXT:    s_endpgm
189;
190; GFX10-DL-LABEL: udot8_acc32:
191; GFX10-DL:       ; %bb.0: ; %entry
192; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
193; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
194; GFX10-DL-NEXT:    s_mov_b32 s10, -1
195; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
196; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
197; GFX10-DL-NEXT:    s_clause 0x1
198; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
199; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
200; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
201; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
202; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
203; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
204; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
205; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
206; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
207; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
208; GFX10-DL-NEXT:    v_dot8_u32_u4 v0, s0, s1, v0
209; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
210; GFX10-DL-NEXT:    s_endpgm
211                                       <8 x i4> addrspace(1)* %src2,
212                                       i32 addrspace(1)* nocapture %dst) {
213entry:
  ; Load the two packed <8 x i4> operands.
214  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
215  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
216
  ; Lanes 0-7: extract the i4 element from each operand, zero-extend both to
  ; i32 and multiply. Each product is at most 15 * 15 = 225, so the nuw/nsw
  ; flags on the multiplies hold for every input.
217  %v1e0 = extractelement <8 x i4> %vec1, i64 0
218  %cv1e0 = zext i4 %v1e0 to i32
219  %v2e0 = extractelement <8 x i4> %vec2, i64 0
220  %cv2e0 = zext i4 %v2e0 to i32
221  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
222
223  %v1e1 = extractelement <8 x i4> %vec1, i64 1
224  %cv1e1 = zext i4 %v1e1 to i32
225  %v2e1 = extractelement <8 x i4> %vec2, i64 1
226  %cv2e1 = zext i4 %v2e1 to i32
227  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
228
229  %v1e2 = extractelement <8 x i4> %vec1, i64 2
230  %cv1e2 = zext i4 %v1e2 to i32
231  %v2e2 = extractelement <8 x i4> %vec2, i64 2
232  %cv2e2 = zext i4 %v2e2 to i32
233  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
234
235  %v1e3 = extractelement <8 x i4> %vec1, i64 3
236  %cv1e3 = zext i4 %v1e3 to i32
237  %v2e3 = extractelement <8 x i4> %vec2, i64 3
238  %cv2e3 = zext i4 %v2e3 to i32
239  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
240
241  %v1e4 = extractelement <8 x i4> %vec1, i64 4
242  %cv1e4 = zext i4 %v1e4 to i32
243  %v2e4 = extractelement <8 x i4> %vec2, i64 4
244  %cv2e4 = zext i4 %v2e4 to i32
245  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
246
247  %v1e5 = extractelement <8 x i4> %vec1, i64 5
248  %cv1e5 = zext i4 %v1e5 to i32
249  %v2e5 = extractelement <8 x i4> %vec2, i64 5
250  %cv2e5 = zext i4 %v2e5 to i32
251  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
252
253  %v1e6 = extractelement <8 x i4> %vec1, i64 6
254  %cv1e6 = zext i4 %v1e6 to i32
255  %v2e6 = extractelement <8 x i4> %vec2, i64 6
256  %cv2e6 = zext i4 %v2e6 to i32
257  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
258
259  %v1e7 = extractelement <8 x i4> %vec1, i64 7
260  %cv1e7 = zext i4 %v1e7 to i32
261  %v2e7 = extractelement <8 x i4> %vec2, i64 7
262  %cv2e7 = zext i4 %v2e7 to i32
263  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
264
  ; Fold the eight products, in lane order, into the i32 read from %dst.
265  %acc = load i32, i32 addrspace(1)* %dst, align 4
266  %add1 = add i32 %mul0, %acc
267  %add2 = add i32 %add1, %mul1
268  %add3 = add i32 %add2, %mul2
269  %add4 = add i32 %add3, %mul3
270  %add5 = add i32 %add4, %mul4
271  %add6 = add i32 %add5, %mul5
272  %add7 = add i32 %add6, %mul6
273  %add8 = add i32 %add7, %mul7
274
  ; Store the accumulated sum back to %dst.
275  store i32 %add8, i32 addrspace(1)* %dst, align 4
276  ret void
277}
278
279; TODO: Remove the unnecessary instruction (the one zero-extending the result
280; of the 2nd MAD) so that the pattern recognizer can kick in.
; udot8_acc16: unsigned 8-lane i4 dot product accumulated into an i16.
; Loads one <8 x i4> vector from each of %src1 and %src2, zero-extends every
; lane to i16, multiplies matching lanes, and adds the eight products to the
; i16 loaded from %dst before storing the sum back to %dst.
; Per the checks below, no run emits v_dot8_u32_u4 for this function. Every
; run emits a chain of v_mad_u32_u24, and every run except gfx700 also emits
; a v_and_b32 with 0xffff after the second MAD.
281define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
282; GFX7-LABEL: udot8_acc16:
283; GFX7:       ; %bb.0: ; %entry
284; GFX7-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
285; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
286; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
287; GFX7-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
288; GFX7-NEXT:    s_mov_b32 s22, -1
289; GFX7-NEXT:    s_mov_b32 s23, 0xe8f000
290; GFX7-NEXT:    s_add_u32 s20, s20, s3
291; GFX7-NEXT:    s_mov_b32 s3, 0xf000
292; GFX7-NEXT:    s_mov_b32 s2, -1
293; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
295; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
296; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
297; GFX7-NEXT:    s_addc_u32 s21, s21, 0
298; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
300; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
301; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
302; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
303; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
304; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
305; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
306; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
307; GFX7-NEXT:    s_and_b32 s5, s5, 15
308; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
309; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
310; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
311; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
312; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
313; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
314; GFX7-NEXT:    s_and_b32 s4, s4, 15
315; GFX7-NEXT:    v_mov_b32_e32 v1, s5
316; GFX7-NEXT:    v_mov_b32_e32 v2, s19
317; GFX7-NEXT:    v_mov_b32_e32 v3, s18
318; GFX7-NEXT:    v_mov_b32_e32 v4, s17
319; GFX7-NEXT:    v_mov_b32_e32 v5, s16
320; GFX7-NEXT:    v_mov_b32_e32 v6, s15
321; GFX7-NEXT:    v_mov_b32_e32 v7, s14
322; GFX7-NEXT:    s_waitcnt vmcnt(0)
323; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
324; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
325; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
326; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
327; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
328; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
329; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
330; GFX7-NEXT:    v_mov_b32_e32 v1, s13
331; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
332; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
333; GFX7-NEXT:    s_endpgm
334;
335; GFX8-LABEL: udot8_acc16:
336; GFX8:       ; %bb.0: ; %entry
337; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
338; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
339; GFX8-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
340; GFX8-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
341; GFX8-NEXT:    s_mov_b32 s18, -1
342; GFX8-NEXT:    s_mov_b32 s19, 0xe80000
343; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
344; GFX8-NEXT:    v_mov_b32_e32 v0, s0
345; GFX8-NEXT:    v_mov_b32_e32 v1, s1
346; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
347; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
348; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
349; GFX8-NEXT:    s_add_u32 s16, s16, s3
350; GFX8-NEXT:    s_addc_u32 s17, s17, 0
351; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
352; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
353; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
354; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
355; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
356; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x4000c
357; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
358; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
359; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
360; GFX8-NEXT:    s_and_b32 s1, s1, 15
361; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
362; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
363; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
364; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x4000c
365; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
366; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
367; GFX8-NEXT:    s_and_b32 s0, s0, 15
368; GFX8-NEXT:    v_mov_b32_e32 v3, s1
369; GFX8-NEXT:    v_mov_b32_e32 v4, s15
370; GFX8-NEXT:    v_mov_b32_e32 v5, s14
371; GFX8-NEXT:    v_mov_b32_e32 v6, s13
372; GFX8-NEXT:    v_mov_b32_e32 v7, s12
373; GFX8-NEXT:    v_mov_b32_e32 v8, s11
374; GFX8-NEXT:    v_mov_b32_e32 v9, s10
375; GFX8-NEXT:    s_waitcnt vmcnt(0)
376; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
377; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
378; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
379; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
380; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
381; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
382; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
383; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
384; GFX8-NEXT:    v_mov_b32_e32 v3, s9
385; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
386; GFX8-NEXT:    flat_store_short v[0:1], v2
387; GFX8-NEXT:    s_endpgm
388;
389; GFX9-LABEL: udot8_acc16:
390; GFX9:       ; %bb.0: ; %entry
391; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
392; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
393; GFX9-NEXT:    v_mov_b32_e32 v0, 0
394; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
395; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
396; GFX9-NEXT:    s_mov_b32 s22, -1
397; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
398; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
399; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
400; GFX9-NEXT:    s_add_u32 s20, s20, s3
401; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
402; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
403; GFX9-NEXT:    s_addc_u32 s21, s21, 0
404; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
405; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
406; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40018
407; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40014
408; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40010
409; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x4000c
410; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40008
411; GFX9-NEXT:    s_bfe_u32 s17, s3, 0x40004
412; GFX9-NEXT:    s_lshr_b32 s11, s3, 28
413; GFX9-NEXT:    s_and_b32 s3, s3, 15
414; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
415; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
416; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
417; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x4000c
418; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
419; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40004
420; GFX9-NEXT:    s_and_b32 s2, s2, 15
421; GFX9-NEXT:    v_mov_b32_e32 v2, s3
422; GFX9-NEXT:    v_mov_b32_e32 v3, s17
423; GFX9-NEXT:    v_mov_b32_e32 v4, s16
424; GFX9-NEXT:    v_mov_b32_e32 v5, s15
425; GFX9-NEXT:    v_mov_b32_e32 v6, s14
426; GFX9-NEXT:    v_mov_b32_e32 v7, s13
427; GFX9-NEXT:    v_mov_b32_e32 v8, s12
428; GFX9-NEXT:    s_waitcnt vmcnt(0)
429; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
430; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
431; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
432; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
433; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
434; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
435; GFX9-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
436; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
437; GFX9-NEXT:    v_mov_b32_e32 v2, s11
438; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
439; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
440; GFX9-NEXT:    s_endpgm
441;
442; GFX9-DL-LABEL: udot8_acc16:
443; GFX9-DL:       ; %bb.0: ; %entry
444; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
445; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
446; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
447; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
448; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
449; GFX9-DL-NEXT:    s_mov_b32 s22, -1
450; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
451; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
452; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
453; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
454; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
455; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
456; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
457; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
458; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
459; GFX9-DL-NEXT:    s_bfe_u32 s12, s3, 0x40018
460; GFX9-DL-NEXT:    s_bfe_u32 s13, s3, 0x40014
461; GFX9-DL-NEXT:    s_bfe_u32 s14, s3, 0x40010
462; GFX9-DL-NEXT:    s_bfe_u32 s15, s3, 0x4000c
463; GFX9-DL-NEXT:    s_bfe_u32 s16, s3, 0x40008
464; GFX9-DL-NEXT:    s_bfe_u32 s17, s3, 0x40004
465; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 28
466; GFX9-DL-NEXT:    s_and_b32 s3, s3, 15
467; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
468; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
469; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
470; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
471; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
472; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40004
473; GFX9-DL-NEXT:    s_and_b32 s2, s2, 15
474; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
475; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
476; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
477; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
478; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
479; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
480; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
481; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
482; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
483; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
484; GFX9-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
485; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
486; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
487; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
488; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
489; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
490; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
491; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
492; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
493; GFX9-DL-NEXT:    s_endpgm
494;
495; GFX10-DL-LABEL: udot8_acc16:
496; GFX10-DL:       ; %bb.0: ; %entry
497; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
498; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
499; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
500; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
501; GFX10-DL-NEXT:    s_mov_b32 s10, -1
502; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
503; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
504; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
505; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
506; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
507; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
508; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
509; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
510; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
511; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
512; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
513; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
514; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
515; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
516; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40004
517; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
518; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
519; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40008
520; GFX10-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
521; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
522; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x4000c
523; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x4000c
524; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
525; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
526; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
527; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
528; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40014
529; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40014
530; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
531; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
532; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
533; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
534; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
535; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
536; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
537; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
538; GFX10-DL-NEXT:    s_endpgm
539                                       <8 x i4> addrspace(1)* %src2,
540                                       i16 addrspace(1)* nocapture %dst) {
541entry:
  ; Load the two packed <8 x i4> operands.
542  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
543  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
544
  ; Lanes 0-7: extract the i4 element from each operand, zero-extend both to
  ; i16 and multiply. Each product is at most 15 * 15 = 225, so the nuw/nsw
  ; flags on the multiplies hold for every input.
545  %v1e0 = extractelement <8 x i4> %vec1, i64 0
546  %cv1e0 = zext i4 %v1e0 to i16
547  %v2e0 = extractelement <8 x i4> %vec2, i64 0
548  %cv2e0 = zext i4 %v2e0 to i16
549  %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
550
551  %v1e1 = extractelement <8 x i4> %vec1, i64 1
552  %cv1e1 = zext i4 %v1e1 to i16
553  %v2e1 = extractelement <8 x i4> %vec2, i64 1
554  %cv2e1 = zext i4 %v2e1 to i16
555  %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
556
557  %v1e2 = extractelement <8 x i4> %vec1, i64 2
558  %cv1e2 = zext i4 %v1e2 to i16
559  %v2e2 = extractelement <8 x i4> %vec2, i64 2
560  %cv2e2 = zext i4 %v2e2 to i16
561  %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
562
563  %v1e3 = extractelement <8 x i4> %vec1, i64 3
564  %cv1e3 = zext i4 %v1e3 to i16
565  %v2e3 = extractelement <8 x i4> %vec2, i64 3
566  %cv2e3 = zext i4 %v2e3 to i16
567  %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
568
569  %v1e4 = extractelement <8 x i4> %vec1, i64 4
570  %cv1e4 = zext i4 %v1e4 to i16
571  %v2e4 = extractelement <8 x i4> %vec2, i64 4
572  %cv2e4 = zext i4 %v2e4 to i16
573  %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
574
575  %v1e5 = extractelement <8 x i4> %vec1, i64 5
576  %cv1e5 = zext i4 %v1e5 to i16
577  %v2e5 = extractelement <8 x i4> %vec2, i64 5
578  %cv2e5 = zext i4 %v2e5 to i16
579  %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
580
581  %v1e6 = extractelement <8 x i4> %vec1, i64 6
582  %cv1e6 = zext i4 %v1e6 to i16
583  %v2e6 = extractelement <8 x i4> %vec2, i64 6
584  %cv2e6 = zext i4 %v2e6 to i16
585  %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
586
587  %v1e7 = extractelement <8 x i4> %vec1, i64 7
588  %cv1e7 = zext i4 %v1e7 to i16
589  %v2e7 = extractelement <8 x i4> %vec2, i64 7
590  %cv2e7 = zext i4 %v2e7 to i16
591  %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
592
  ; Fold the eight products, in lane order, into the i16 read from %dst.
593  %acc = load i16, i16 addrspace(1)* %dst, align 4
594  %add1 = add i16 %mul0, %acc
595  %add2 = add i16 %add1, %mul1
596  %add3 = add i16 %add2, %mul2
597  %add4 = add i16 %add3, %mul3
598  %add5 = add i16 %add4, %mul4
599  %add6 = add i16 %add5, %mul5
600  %add7 = add i16 %add6, %mul6
601  %add8 = add i16 %add7, %mul7
602
  ; Store the accumulated sum back to %dst.
603  store i16 %add8, i16 addrspace(1)* %dst, align 4
604  ret void
605}
606
607; TODO: Remove the unnecessary instruction (the one zero-extending the result
608; of the 2nd MAD) so that the pattern recognizer can kick in.
609define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
610; GFX7-LABEL: udot8_acc8:
611; GFX7:       ; %bb.0: ; %entry
612; GFX7-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
613; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
614; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
615; GFX7-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
616; GFX7-NEXT:    s_mov_b32 s22, -1
617; GFX7-NEXT:    s_mov_b32 s23, 0xe8f000
618; GFX7-NEXT:    s_add_u32 s20, s20, s3
619; GFX7-NEXT:    s_mov_b32 s3, 0xf000
620; GFX7-NEXT:    s_mov_b32 s2, -1
621; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
622; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
623; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
624; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
625; GFX7-NEXT:    s_addc_u32 s21, s21, 0
626; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
627; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
628; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
629; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
630; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
631; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
632; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
633; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
634; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
635; GFX7-NEXT:    s_and_b32 s5, s5, 15
636; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
637; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
638; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
639; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
640; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
641; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
642; GFX7-NEXT:    s_and_b32 s4, s4, 15
643; GFX7-NEXT:    v_mov_b32_e32 v1, s5
644; GFX7-NEXT:    v_mov_b32_e32 v2, s19
645; GFX7-NEXT:    v_mov_b32_e32 v3, s18
646; GFX7-NEXT:    v_mov_b32_e32 v4, s17
647; GFX7-NEXT:    v_mov_b32_e32 v5, s16
648; GFX7-NEXT:    v_mov_b32_e32 v6, s15
649; GFX7-NEXT:    v_mov_b32_e32 v7, s14
650; GFX7-NEXT:    s_waitcnt vmcnt(0)
651; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
652; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
653; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
654; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
655; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
656; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
657; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
658; GFX7-NEXT:    v_mov_b32_e32 v1, s13
659; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
660; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
661; GFX7-NEXT:    s_endpgm
662;
663; GFX8-LABEL: udot8_acc8:
664; GFX8:       ; %bb.0: ; %entry
665; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
666; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
667; GFX8-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
668; GFX8-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
669; GFX8-NEXT:    s_mov_b32 s18, -1
670; GFX8-NEXT:    s_mov_b32 s19, 0xe80000
671; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
672; GFX8-NEXT:    v_mov_b32_e32 v0, s0
673; GFX8-NEXT:    v_mov_b32_e32 v1, s1
674; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
675; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
676; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
677; GFX8-NEXT:    s_add_u32 s16, s16, s3
678; GFX8-NEXT:    s_addc_u32 s17, s17, 0
679; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
680; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
681; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
682; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
683; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
684; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x4000c
685; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
686; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
687; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
688; GFX8-NEXT:    s_and_b32 s1, s1, 15
689; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
690; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
691; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
692; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x4000c
693; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
694; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
695; GFX8-NEXT:    s_and_b32 s0, s0, 15
696; GFX8-NEXT:    v_mov_b32_e32 v3, s1
697; GFX8-NEXT:    v_mov_b32_e32 v4, s15
698; GFX8-NEXT:    v_mov_b32_e32 v5, s14
699; GFX8-NEXT:    v_mov_b32_e32 v6, s13
700; GFX8-NEXT:    v_mov_b32_e32 v7, s12
701; GFX8-NEXT:    v_mov_b32_e32 v8, s11
702; GFX8-NEXT:    v_mov_b32_e32 v9, s10
703; GFX8-NEXT:    s_waitcnt vmcnt(0)
704; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
705; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
706; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v2
707; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
708; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
709; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
710; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
711; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
712; GFX8-NEXT:    v_mov_b32_e32 v3, s9
713; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
714; GFX8-NEXT:    flat_store_byte v[0:1], v2
715; GFX8-NEXT:    s_endpgm
716;
717; GFX9-LABEL: udot8_acc8:
718; GFX9:       ; %bb.0: ; %entry
719; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
720; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
721; GFX9-NEXT:    v_mov_b32_e32 v0, 0
722; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
723; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
724; GFX9-NEXT:    s_mov_b32 s22, -1
725; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
726; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
727; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
728; GFX9-NEXT:    s_add_u32 s20, s20, s3
729; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
730; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
731; GFX9-NEXT:    s_addc_u32 s21, s21, 0
732; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
733; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
734; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40018
735; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40014
736; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40010
737; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x4000c
738; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40008
739; GFX9-NEXT:    s_bfe_u32 s17, s3, 0x40004
740; GFX9-NEXT:    s_lshr_b32 s11, s3, 28
741; GFX9-NEXT:    s_and_b32 s3, s3, 15
742; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
743; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
744; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
745; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x4000c
746; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
747; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40004
748; GFX9-NEXT:    s_and_b32 s2, s2, 15
749; GFX9-NEXT:    v_mov_b32_e32 v2, s3
750; GFX9-NEXT:    v_mov_b32_e32 v3, s17
751; GFX9-NEXT:    v_mov_b32_e32 v4, s16
752; GFX9-NEXT:    v_mov_b32_e32 v5, s15
753; GFX9-NEXT:    v_mov_b32_e32 v6, s14
754; GFX9-NEXT:    v_mov_b32_e32 v7, s13
755; GFX9-NEXT:    v_mov_b32_e32 v8, s12
756; GFX9-NEXT:    s_waitcnt vmcnt(0)
757; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
758; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
759; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
760; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
761; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
762; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
763; GFX9-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
764; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
765; GFX9-NEXT:    v_mov_b32_e32 v2, s11
766; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
767; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
768; GFX9-NEXT:    s_endpgm
769;
770; GFX9-DL-LABEL: udot8_acc8:
771; GFX9-DL:       ; %bb.0: ; %entry
772; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
773; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
774; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
775; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
776; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
777; GFX9-DL-NEXT:    s_mov_b32 s22, -1
778; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
779; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
780; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
781; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
782; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
783; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
784; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
785; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
786; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
787; GFX9-DL-NEXT:    s_bfe_u32 s12, s3, 0x40018
788; GFX9-DL-NEXT:    s_bfe_u32 s13, s3, 0x40014
789; GFX9-DL-NEXT:    s_bfe_u32 s14, s3, 0x40010
790; GFX9-DL-NEXT:    s_bfe_u32 s15, s3, 0x4000c
791; GFX9-DL-NEXT:    s_bfe_u32 s16, s3, 0x40008
792; GFX9-DL-NEXT:    s_bfe_u32 s17, s3, 0x40004
793; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 28
794; GFX9-DL-NEXT:    s_and_b32 s3, s3, 15
795; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
796; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
797; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
798; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
799; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
800; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40004
801; GFX9-DL-NEXT:    s_and_b32 s2, s2, 15
802; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
803; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
804; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
805; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
806; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
807; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
808; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
809; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
810; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
811; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
812; GFX9-DL-NEXT:    v_and_b32_e32 v1, 0xff, v1
813; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
814; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
815; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
816; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
817; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
818; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
819; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
820; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
821; GFX9-DL-NEXT:    s_endpgm
822;
823; GFX10-DL-LABEL: udot8_acc8:
824; GFX10-DL:       ; %bb.0: ; %entry
825; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
826; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
827; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
828; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
829; GFX10-DL-NEXT:    s_mov_b32 s10, -1
830; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
831; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
832; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
833; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
834; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
835; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
836; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
837; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
838; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
839; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
840; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
841; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
842; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
843; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
844; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40004
845; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
846; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
847; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40008
848; GFX10-DL-NEXT:    v_and_b32_e32 v1, 0xff, v1
849; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
850; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x4000c
851; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x4000c
852; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
853; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
854; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
855; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
856; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40014
857; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40014
858; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
859; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
860; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
861; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
862; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
863; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
864; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
865; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
866; GFX10-DL-NEXT:    s_endpgm
867                                      <8 x i4> addrspace(1)* %src2,
868                                      i8 addrspace(1)* nocapture %dst) {
869entry:
870  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
871  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
872
873  %v1e0 = extractelement <8 x i4> %vec1, i64 0
874  %cv1e0 = zext i4 %v1e0 to i8
875  %v2e0 = extractelement <8 x i4> %vec2, i64 0
876  %cv2e0 = zext i4 %v2e0 to i8
877  %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
878
879  %v1e1 = extractelement <8 x i4> %vec1, i64 1
880  %cv1e1 = zext i4 %v1e1 to i8
881  %v2e1 = extractelement <8 x i4> %vec2, i64 1
882  %cv2e1 = zext i4 %v2e1 to i8
883  %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
884
885  %v1e2 = extractelement <8 x i4> %vec1, i64 2
886  %cv1e2 = zext i4 %v1e2 to i8
887  %v2e2 = extractelement <8 x i4> %vec2, i64 2
888  %cv2e2 = zext i4 %v2e2 to i8
889  %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
890
891  %v1e3 = extractelement <8 x i4> %vec1, i64 3
892  %cv1e3 = zext i4 %v1e3 to i8
893  %v2e3 = extractelement <8 x i4> %vec2, i64 3
894  %cv2e3 = zext i4 %v2e3 to i8
895  %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
896
897  %v1e4 = extractelement <8 x i4> %vec1, i64 4
898  %cv1e4 = zext i4 %v1e4 to i8
899  %v2e4 = extractelement <8 x i4> %vec2, i64 4
900  %cv2e4 = zext i4 %v2e4 to i8
901  %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
902
903  %v1e5 = extractelement <8 x i4> %vec1, i64 5
904  %cv1e5 = zext i4 %v1e5 to i8
905  %v2e5 = extractelement <8 x i4> %vec2, i64 5
906  %cv2e5 = zext i4 %v2e5 to i8
907  %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
908
909  %v1e6 = extractelement <8 x i4> %vec1, i64 6
910  %cv1e6 = zext i4 %v1e6 to i8
911  %v2e6 = extractelement <8 x i4> %vec2, i64 6
912  %cv2e6 = zext i4 %v2e6 to i8
913  %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
914
915  %v1e7 = extractelement <8 x i4> %vec1, i64 7
916  %cv1e7 = zext i4 %v1e7 to i8
917  %v2e7 = extractelement <8 x i4> %vec2, i64 7
918  %cv2e7 = zext i4 %v2e7 to i8
919  %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
920
921  %acc = load i8, i8 addrspace(1)* %dst, align 4
922  %add1 = add i8 %mul0, %acc
923  %add2 = add i8 %add1, %mul1
924  %add3 = add i8 %add2, %mul2
925  %add4 = add i8 %add3, %mul3
926  %add5 = add i8 %add4, %mul4
927  %add6 = add i8 %add5, %mul5
928  %add7 = add i8 %add6, %mul6
929  %add8 = add i8 %add7, %mul7
930
931  store i8 %add8, i8 addrspace(1)* %dst, align 4
932  ret void
933}
934
935; TODO: Remove the two unnecessary instructions (and + add after the 2nd MAD)
936; so that the pattern recognizer can kick in.
; udot8_acc4: dot product of two <8 x i4> vectors with an i4 accumulator.
; The kernel loads one vector from each of %src1 and %src2, multiplies the
; eight lane pairs directly in i4 (no widening), adds the products to the i4
; value loaded from %dst, and stores the i4 sum back to %dst.
;
; The autogenerated checks below pin the current lowering for each RUN line.
; None of the check blocks contains a dot instruction: the sum is built from
; v_mad_u32_u24 and masked with 15 before the byte store. In every check block
; except the first (gfx700) one, the lane-3 product is additionally formed by a
; separate v_mul_u32_u24, masked with 15, and added to the masked partial sum
; after the third v_mad_u32_u24.
937define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
938; GFX7-LABEL: udot8_acc4:
939; GFX7:       ; %bb.0: ; %entry
940; GFX7-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
941; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
942; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
943; GFX7-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
944; GFX7-NEXT:    s_mov_b32 s22, -1
945; GFX7-NEXT:    s_mov_b32 s23, 0xe8f000
946; GFX7-NEXT:    s_add_u32 s20, s20, s3
947; GFX7-NEXT:    s_mov_b32 s3, 0xf000
948; GFX7-NEXT:    s_mov_b32 s2, -1
949; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
950; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
951; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
952; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
953; GFX7-NEXT:    s_addc_u32 s21, s21, 0
954; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
955; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
956; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
957; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
958; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
959; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
960; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
961; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
962; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
963; GFX7-NEXT:    s_and_b32 s5, s5, 15
964; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
965; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
966; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
967; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
968; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
969; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
970; GFX7-NEXT:    s_and_b32 s4, s4, 15
971; GFX7-NEXT:    v_mov_b32_e32 v1, s5
972; GFX7-NEXT:    v_mov_b32_e32 v2, s19
973; GFX7-NEXT:    v_mov_b32_e32 v3, s18
974; GFX7-NEXT:    v_mov_b32_e32 v4, s17
975; GFX7-NEXT:    v_mov_b32_e32 v5, s16
976; GFX7-NEXT:    v_mov_b32_e32 v6, s15
977; GFX7-NEXT:    v_mov_b32_e32 v7, s14
978; GFX7-NEXT:    s_waitcnt vmcnt(0)
979; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
980; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
981; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
982; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
983; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
984; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
985; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
986; GFX7-NEXT:    v_mov_b32_e32 v1, s13
987; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
988; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
989; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
990; GFX7-NEXT:    s_endpgm
991;
992; GFX8-LABEL: udot8_acc4:
993; GFX8:       ; %bb.0: ; %entry
994; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
995; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
996; GFX8-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
997; GFX8-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
998; GFX8-NEXT:    s_mov_b32 s18, -1
999; GFX8-NEXT:    s_mov_b32 s19, 0xe80000
1000; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1001; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1002; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1003; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
1004; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1005; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
1006; GFX8-NEXT:    s_add_u32 s16, s16, s3
1007; GFX8-NEXT:    s_addc_u32 s17, s17, 0
1008; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1009; GFX8-NEXT:    s_and_b32 s8, s0, 15
1010; GFX8-NEXT:    s_and_b32 s15, s1, 15
1011; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40004
1012; GFX8-NEXT:    v_mov_b32_e32 v4, s15
1013; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
1014; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
1015; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
1016; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40008
1017; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
1018; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x4000c
1019; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40004
1020; GFX8-NEXT:    v_mov_b32_e32 v5, s14
1021; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
1022; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
1023; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
1024; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
1025; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40008
1026; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x4000c
1027; GFX8-NEXT:    v_mov_b32_e32 v3, s1
1028; GFX8-NEXT:    v_mov_b32_e32 v6, s13
1029; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
1030; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
1031; GFX8-NEXT:    v_mov_b32_e32 v7, s12
1032; GFX8-NEXT:    v_mov_b32_e32 v8, s11
1033; GFX8-NEXT:    v_mov_b32_e32 v9, s10
1034; GFX8-NEXT:    s_waitcnt vmcnt(0)
1035; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
1036; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
1037; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
1038; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
1039; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
1040; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
1041; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
1042; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
1043; GFX8-NEXT:    v_mov_b32_e32 v3, s9
1044; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
1045; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
1046; GFX8-NEXT:    flat_store_byte v[0:1], v2
1047; GFX8-NEXT:    s_endpgm
1048;
1049; GFX9-LABEL: udot8_acc4:
1050; GFX9:       ; %bb.0: ; %entry
1051; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1052; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1053; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1054; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1055; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1056; GFX9-NEXT:    s_mov_b32 s22, -1
1057; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1058; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
1059; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
1060; GFX9-NEXT:    s_add_u32 s20, s20, s3
1061; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
1062; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
1063; GFX9-NEXT:    s_addc_u32 s21, s21, 0
1064; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1065; GFX9-NEXT:    s_and_b32 s10, s2, 15
1066; GFX9-NEXT:    s_and_b32 s17, s3, 15
1067; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40004
1068; GFX9-NEXT:    v_mov_b32_e32 v3, s17
1069; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40018
1070; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40014
1071; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40010
1072; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x40008
1073; GFX9-NEXT:    s_lshr_b32 s11, s3, 28
1074; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x4000c
1075; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40004
1076; GFX9-NEXT:    v_mov_b32_e32 v4, s16
1077; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
1078; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
1079; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
1080; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
1081; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40008
1082; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x4000c
1083; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1084; GFX9-NEXT:    v_mov_b32_e32 v5, s15
1085; GFX9-NEXT:    v_mul_u32_u24_e32 v2, s2, v2
1086; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
1087; GFX9-NEXT:    v_mov_b32_e32 v6, s14
1088; GFX9-NEXT:    v_mov_b32_e32 v7, s13
1089; GFX9-NEXT:    v_mov_b32_e32 v8, s12
1090; GFX9-NEXT:    s_waitcnt vmcnt(0)
1091; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
1092; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
1093; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
1094; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1095; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
1096; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
1097; GFX9-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
1098; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
1099; GFX9-NEXT:    v_mov_b32_e32 v2, s11
1100; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
1101; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1102; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
1103; GFX9-NEXT:    s_endpgm
1104;
1105; GFX9-DL-LABEL: udot8_acc4:
1106; GFX9-DL:       ; %bb.0: ; %entry
1107; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1108; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1109; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1110; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1111; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1112; GFX9-DL-NEXT:    s_mov_b32 s22, -1
1113; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1114; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
1115; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
1116; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
1117; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
1118; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
1119; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
1120; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1121; GFX9-DL-NEXT:    s_and_b32 s10, s2, 15
1122; GFX9-DL-NEXT:    s_and_b32 s17, s3, 15
1123; GFX9-DL-NEXT:    s_bfe_u32 s16, s3, 0x40004
1124; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
1125; GFX9-DL-NEXT:    s_bfe_u32 s12, s3, 0x40018
1126; GFX9-DL-NEXT:    s_bfe_u32 s13, s3, 0x40014
1127; GFX9-DL-NEXT:    s_bfe_u32 s14, s3, 0x40010
1128; GFX9-DL-NEXT:    s_bfe_u32 s15, s3, 0x40008
1129; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 28
1130; GFX9-DL-NEXT:    s_bfe_u32 s3, s3, 0x4000c
1131; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40004
1132; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
1133; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
1134; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
1135; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
1136; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
1137; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40008
1138; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x4000c
1139; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
1140; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
1141; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, s2, v2
1142; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1143; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
1144; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
1145; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
1146; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1147; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
1148; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
1149; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
1150; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1151; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
1152; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
1153; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
1154; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
1155; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
1156; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
1157; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1158; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
1159; GFX9-DL-NEXT:    s_endpgm
1160;
1161; GFX10-DL-LABEL: udot8_acc4:
1162; GFX10-DL:       ; %bb.0: ; %entry
1163; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1164; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1165; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1166; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1167; GFX10-DL-NEXT:    s_mov_b32 s10, -1
1168; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
1169; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
1170; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1171; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
1172; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1173; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
1174; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1175; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1176; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1177; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
1178; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
1179; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40008
1180; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
1181; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1182; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1183; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
1184; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40004
1185; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1186; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
1187; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x4000c
1188; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v2, s3, s7
1189; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s6, v1
1190; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
1191; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
1192; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1193; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1194; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1195; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1196; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40014
1197; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40014
1198; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1199; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
1200; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
1201; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
1202; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
1203; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1204; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
1205; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1206; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
1207; GFX10-DL-NEXT:    s_endpgm
1208                                      <8 x i4> addrspace(1)* %src2,
1209                                      i4 addrspace(1)* nocapture %dst) {
1210entry:
; Load both <8 x i4> operand vectors from global memory (addrspace(1)).
1211  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
1212  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
1213
; Lane-wise products, lanes 0..7. Each multiply is done directly in i4; the
; nuw/nsw flags make an overflowing i4 product poison.
1214  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1215  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1216  %mul0 = mul nuw nsw i4 %v1e0, %v2e0
1217
1218  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1219  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1220  %mul1 = mul nuw nsw i4 %v1e1, %v2e1
1221
1222  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1223  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1224  %mul2 = mul nuw nsw i4 %v1e2, %v2e2
1225
1226  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1227  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1228  %mul3 = mul nuw nsw i4 %v1e3, %v2e3
1229
1230  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1231  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1232  %mul4 = mul nuw nsw i4 %v1e4, %v2e4
1233
1234  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1235  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1236  %mul5 = mul nuw nsw i4 %v1e5, %v2e5
1237
1238  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1239  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1240  %mul6 = mul nuw nsw i4 %v1e6, %v2e6
1241
1242  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1243  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1244  %mul7 = mul nuw nsw i4 %v1e7, %v2e7
1245
; Accumulate: start from the i4 value currently stored at %dst and add the
; eight products in lane order. The adds carry no nuw/nsw flags, so the i4 sum
; is allowed to wrap.
1246  %acc = load i4, i4 addrspace(1)* %dst, align 4
1247  %add1 = add i4 %mul0, %acc
1248  %add2 = add i4 %add1, %mul1
1249  %add3 = add i4 %add2, %mul2
1250  %add4 = add i4 %add3, %mul3
1251  %add5 = add i4 %add4, %mul4
1252  %add6 = add i4 %add5, %mul5
1253  %add7 = add i4 %add6, %mul6
1254  %add8 = add i4 %add7, %mul7
1255
; Write the final i4 sum back over the accumulator at %dst.
1256  store i4 %add8, i4 addrspace(1)* %dst, align 4
1257  ret void
1258}
1259
1260; TODO: Currently, permutation of udot8 is turned off due to a huge increase
1261; in compile time.
1262define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1,
1263; GFX7-LABEL: udot8_CommutationInsideMAD:
1264; GFX7:       ; %bb.0: ; %entry
1265; GFX7-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1266; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1267; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1268; GFX7-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1269; GFX7-NEXT:    s_mov_b32 s22, -1
1270; GFX7-NEXT:    s_mov_b32 s23, 0xe8f000
1271; GFX7-NEXT:    s_add_u32 s20, s20, s3
1272; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1273; GFX7-NEXT:    s_mov_b32 s2, -1
1274; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1275; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
1276; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1277; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1278; GFX7-NEXT:    s_addc_u32 s21, s21, 0
1279; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1280; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
1281; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
1282; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
1283; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
1284; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
1285; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
1286; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
1287; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
1288; GFX7-NEXT:    s_and_b32 s5, s5, 15
1289; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
1290; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
1291; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
1292; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
1293; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
1294; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
1295; GFX7-NEXT:    s_and_b32 s4, s4, 15
1296; GFX7-NEXT:    v_mov_b32_e32 v1, s5
1297; GFX7-NEXT:    v_mov_b32_e32 v2, s19
1298; GFX7-NEXT:    v_mov_b32_e32 v3, s18
1299; GFX7-NEXT:    v_mov_b32_e32 v4, s17
1300; GFX7-NEXT:    v_mov_b32_e32 v5, s16
1301; GFX7-NEXT:    v_mov_b32_e32 v6, s15
1302; GFX7-NEXT:    v_mov_b32_e32 v7, s14
1303; GFX7-NEXT:    s_waitcnt vmcnt(0)
1304; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
1305; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
1306; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
1307; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
1308; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
1309; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
1310; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
1311; GFX7-NEXT:    v_mov_b32_e32 v1, s13
1312; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
1313; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1314; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1315; GFX7-NEXT:    s_endpgm
1316;
1317; GFX8-LABEL: udot8_CommutationInsideMAD:
1318; GFX8:       ; %bb.0: ; %entry
1319; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1320; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1321; GFX8-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
1322; GFX8-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
1323; GFX8-NEXT:    s_mov_b32 s18, -1
1324; GFX8-NEXT:    s_mov_b32 s19, 0xe80000
1325; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1326; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1327; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1328; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
1329; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1330; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
1331; GFX8-NEXT:    s_add_u32 s16, s16, s3
1332; GFX8-NEXT:    s_addc_u32 s17, s17, 0
1333; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1334; GFX8-NEXT:    s_and_b32 s8, s0, 15
1335; GFX8-NEXT:    s_and_b32 s15, s1, 15
1336; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40004
1337; GFX8-NEXT:    v_mov_b32_e32 v4, s15
1338; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
1339; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
1340; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
1341; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40008
1342; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
1343; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x4000c
1344; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40004
1345; GFX8-NEXT:    v_mov_b32_e32 v5, s14
1346; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
1347; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
1348; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
1349; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
1350; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40008
1351; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x4000c
1352; GFX8-NEXT:    v_mov_b32_e32 v3, s1
1353; GFX8-NEXT:    v_mov_b32_e32 v6, s13
1354; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
1355; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
1356; GFX8-NEXT:    v_mov_b32_e32 v7, s12
1357; GFX8-NEXT:    v_mov_b32_e32 v8, s11
1358; GFX8-NEXT:    v_mov_b32_e32 v9, s10
1359; GFX8-NEXT:    s_waitcnt vmcnt(0)
1360; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
1361; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
1362; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
1363; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
1364; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
1365; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
1366; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
1367; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
1368; GFX8-NEXT:    v_mov_b32_e32 v3, s9
1369; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
1370; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
1371; GFX8-NEXT:    flat_store_byte v[0:1], v2
1372; GFX8-NEXT:    s_endpgm
1373;
1374; GFX9-LABEL: udot8_CommutationInsideMAD:
1375; GFX9:       ; %bb.0: ; %entry
1376; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1377; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1378; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1379; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1380; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1381; GFX9-NEXT:    s_mov_b32 s22, -1
1382; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1383; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
1384; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
1385; GFX9-NEXT:    s_add_u32 s20, s20, s3
1386; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
1387; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
1388; GFX9-NEXT:    s_addc_u32 s21, s21, 0
1389; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1390; GFX9-NEXT:    s_and_b32 s10, s2, 15
1391; GFX9-NEXT:    s_and_b32 s17, s3, 15
1392; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40004
1393; GFX9-NEXT:    v_mov_b32_e32 v3, s17
1394; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40018
1395; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40014
1396; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40010
1397; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x40008
1398; GFX9-NEXT:    s_lshr_b32 s11, s3, 28
1399; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x4000c
1400; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40004
1401; GFX9-NEXT:    v_mov_b32_e32 v4, s16
1402; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
1403; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
1404; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
1405; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
1406; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40008
1407; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x4000c
1408; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1409; GFX9-NEXT:    v_mov_b32_e32 v5, s15
1410; GFX9-NEXT:    v_mul_u32_u24_e32 v2, s2, v2
1411; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
1412; GFX9-NEXT:    v_mov_b32_e32 v6, s14
1413; GFX9-NEXT:    v_mov_b32_e32 v7, s13
1414; GFX9-NEXT:    v_mov_b32_e32 v8, s12
1415; GFX9-NEXT:    s_waitcnt vmcnt(0)
1416; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
1417; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
1418; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
1419; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1420; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
1421; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
1422; GFX9-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
1423; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
1424; GFX9-NEXT:    v_mov_b32_e32 v2, s11
1425; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
1426; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1427; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
1428; GFX9-NEXT:    s_endpgm
1429;
1430; GFX9-DL-LABEL: udot8_CommutationInsideMAD:
1431; GFX9-DL:       ; %bb.0: ; %entry
1432; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1433; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1434; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1435; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1436; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1437; GFX9-DL-NEXT:    s_mov_b32 s22, -1
1438; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1439; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
1440; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
1441; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
1442; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
1443; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
1444; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
1445; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1446; GFX9-DL-NEXT:    s_and_b32 s10, s2, 15
1447; GFX9-DL-NEXT:    s_and_b32 s17, s3, 15
1448; GFX9-DL-NEXT:    s_bfe_u32 s16, s3, 0x40004
1449; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
1450; GFX9-DL-NEXT:    s_bfe_u32 s12, s3, 0x40018
1451; GFX9-DL-NEXT:    s_bfe_u32 s13, s3, 0x40014
1452; GFX9-DL-NEXT:    s_bfe_u32 s14, s3, 0x40010
1453; GFX9-DL-NEXT:    s_bfe_u32 s15, s3, 0x40008
1454; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 28
1455; GFX9-DL-NEXT:    s_bfe_u32 s3, s3, 0x4000c
1456; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40004
1457; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
1458; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
1459; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
1460; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
1461; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
1462; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40008
1463; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x4000c
1464; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
1465; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
1466; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, s2, v2
1467; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1468; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
1469; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
1470; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
1471; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1472; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
1473; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
1474; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
1475; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1476; GFX9-DL-NEXT:    v_add_u32_e32 v1, v2, v1
1477; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
1478; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
1479; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
1480; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
1481; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
1482; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1483; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
1484; GFX9-DL-NEXT:    s_endpgm
1485;
1486; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
1487; GFX10-DL:       ; %bb.0: ; %entry
1488; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1489; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1490; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1491; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1492; GFX10-DL-NEXT:    s_mov_b32 s10, -1
1493; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
1494; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
1495; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1496; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
1497; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1498; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
1499; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1500; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1501; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1502; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
1503; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
1504; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
1505; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40008
1506; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1507; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1508; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
1509; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40004
1510; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1511; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x4000c
1512; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x4000c
1513; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v2, s2, s3
1514; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s6, s7, v1
1515; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
1516; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
1517; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1518; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1519; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
1520; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1521; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40014
1522; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40014
1523; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1524; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
1525; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
1526; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
1527; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
1528; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1529; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
1530; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1531; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
1532; GFX10-DL-NEXT:    s_endpgm
1533                                                      <8 x i4> addrspace(1)* %src2,
1534                                                      i4 addrspace(1)* nocapture %dst) {
1535entry:
1536  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
1537  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
1538
1539  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1540  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1541  %mul0 = mul nuw nsw i4 %v1e0, %v2e0
1542
1543  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1544  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1545  %mul1 = mul nuw nsw i4 %v1e1, %v2e1
1546
1547  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1548  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1549  %mul2 = mul nuw nsw i4 %v1e2, %v2e2
1550
1551  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1552  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1553  %mul3 = mul nuw nsw i4 %v1e3, %v2e3
1554
1555  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1556  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1557  %mul4 = mul nuw nsw i4 %v1e4, %v2e4
1558
1559  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1560  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1561  %mul5 = mul nuw nsw i4 %v1e5, %v2e5
1562
1563  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1564  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1565  %mul6 = mul nuw nsw i4 %v1e6, %v2e6
1566
1567  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1568  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1569  %mul7 = mul nuw nsw i4 %v1e7, %v2e7
1570
1571  %acc = load i4, i4 addrspace(1)* %dst, align 4
1572  %add1 = add i4 %mul0, %acc
1573  %add2 = add i4 %mul1, %add1
1574  %add3 = add i4 %mul2, %add2
1575  %add4 = add i4 %mul3, %add3
1576  %add5 = add i4 %mul4, %add4
1577  %add6 = add i4 %mul5, %add5
1578  %add7 = add i4 %mul6, %add6
1579  %add8 = add i4 %mul7, %add7
1580
1581  store i4 %add8, i4 addrspace(1)* %dst, align 4
1582  ret void
1583}
1584
; udot8_multiuses_mul1: eight-element unsigned i4 dot product accumulated into
; the i32 loaded from %dst, where the element-0 product (%mul0) has a second
; use (%add) outside the %add1..%add8 accumulation chain.
; The assertions below expect v_mad_u32_u24 chains for every checked prefix,
; including GFX9-DL and GFX10-DL; no v_dot8_u32_u4 appears in the expected
; output for this function.
1585define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
1586; GFX7-LABEL: udot8_multiuses_mul1:
1587; GFX7:       ; %bb.0: ; %entry
1588; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1589; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1590; GFX7-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
1591; GFX7-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
1592; GFX7-NEXT:    s_mov_b32 s26, -1
1593; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1594; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
1595; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
1596; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1597; GFX7-NEXT:    s_mov_b32 s27, 0xe8f000
1598; GFX7-NEXT:    s_add_u32 s24, s24, s3
1599; GFX7-NEXT:    s_addc_u32 s25, s25, 0
1600; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1601; GFX7-NEXT:    s_bfe_u32 s19, s6, 0x40004
1602; GFX7-NEXT:    s_lshr_b32 s7, s6, 28
1603; GFX7-NEXT:    s_bfe_u32 s14, s6, 0x40018
1604; GFX7-NEXT:    s_bfe_u32 s15, s6, 0x40014
1605; GFX7-NEXT:    s_bfe_u32 s16, s6, 0x40010
1606; GFX7-NEXT:    s_bfe_u32 s17, s6, 0x4000c
1607; GFX7-NEXT:    s_bfe_u32 s18, s6, 0x40008
1608; GFX7-NEXT:    s_and_b32 s6, s6, 15
1609; GFX7-NEXT:    s_lshr_b32 s5, s4, 28
1610; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40018
1611; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40014
1612; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x40010
1613; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x4000c
1614; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40008
1615; GFX7-NEXT:    s_bfe_u32 s13, s4, 0x40004
1616; GFX7-NEXT:    s_and_b32 s4, s4, 15
1617; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1618; GFX7-NEXT:    v_mov_b32_e32 v1, s20
1619; GFX7-NEXT:    v_mad_u32_u24 v1, s4, v0, v1
1620; GFX7-NEXT:    v_mov_b32_e32 v2, s19
1621; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v0, v1
1622; GFX7-NEXT:    v_mad_u32_u24 v1, s13, v2, v1
1623; GFX7-NEXT:    v_mov_b32_e32 v2, s18
1624; GFX7-NEXT:    v_mad_u32_u24 v1, s12, v2, v1
1625; GFX7-NEXT:    v_mov_b32_e32 v2, s17
1626; GFX7-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
1627; GFX7-NEXT:    v_mov_b32_e32 v2, s16
1628; GFX7-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
1629; GFX7-NEXT:    v_mov_b32_e32 v2, s15
1630; GFX7-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
1631; GFX7-NEXT:    v_mov_b32_e32 v2, s14
1632; GFX7-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
1633; GFX7-NEXT:    v_mov_b32_e32 v2, s7
1634; GFX7-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
1635; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1636; GFX7-NEXT:    s_mov_b32 s2, -1
1637; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
1638; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1639; GFX7-NEXT:    s_endpgm
1640;
1641; GFX8-LABEL: udot8_multiuses_mul1:
1642; GFX8:       ; %bb.0: ; %entry
1643; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1644; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1645; GFX8-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1646; GFX8-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1647; GFX8-NEXT:    s_mov_b32 s22, -1
1648; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1649; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
1650; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
1651; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
1652; GFX8-NEXT:    s_mov_b32 s23, 0xe80000
1653; GFX8-NEXT:    s_add_u32 s20, s20, s3
1654; GFX8-NEXT:    s_addc_u32 s21, s21, 0
1655; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1656; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40004
1657; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
1658; GFX8-NEXT:    s_bfe_u32 s12, s6, 0x40018
1659; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40014
1660; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40010
1661; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x4000c
1662; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x40008
1663; GFX8-NEXT:    s_and_b32 s6, s6, 15
1664; GFX8-NEXT:    s_lshr_b32 s3, s2, 28
1665; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40018
1666; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40014
1667; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
1668; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x4000c
1669; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40008
1670; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40004
1671; GFX8-NEXT:    s_and_b32 s2, s2, 15
1672; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1673; GFX8-NEXT:    v_mov_b32_e32 v1, s18
1674; GFX8-NEXT:    v_mad_u32_u24 v1, s2, v0, v1
1675; GFX8-NEXT:    v_mov_b32_e32 v2, s17
1676; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
1677; GFX8-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
1678; GFX8-NEXT:    v_mov_b32_e32 v2, s16
1679; GFX8-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
1680; GFX8-NEXT:    v_mov_b32_e32 v2, s15
1681; GFX8-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
1682; GFX8-NEXT:    v_mov_b32_e32 v2, s14
1683; GFX8-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
1684; GFX8-NEXT:    v_mov_b32_e32 v2, s13
1685; GFX8-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
1686; GFX8-NEXT:    v_mov_b32_e32 v2, s12
1687; GFX8-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
1688; GFX8-NEXT:    v_mov_b32_e32 v2, s7
1689; GFX8-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
1690; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v1, v0
1691; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1692; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1693; GFX8-NEXT:    flat_store_dword v[0:1], v2
1694; GFX8-NEXT:    s_endpgm
1695;
1696; GFX9-LABEL: udot8_multiuses_mul1:
1697; GFX9:       ; %bb.0: ; %entry
1698; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1699; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1700; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1701; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1702; GFX9-NEXT:    s_mov_b32 s22, -1
1703; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1704; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
1705; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
1706; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
1707; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
1708; GFX9-NEXT:    s_add_u32 s20, s20, s3
1709; GFX9-NEXT:    s_addc_u32 s21, s21, 0
1710; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1711; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40004
1712; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
1713; GFX9-NEXT:    s_bfe_u32 s12, s6, 0x40018
1714; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40014
1715; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40010
1716; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x4000c
1717; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x40008
1718; GFX9-NEXT:    s_and_b32 s6, s6, 15
1719; GFX9-NEXT:    s_lshr_b32 s3, s2, 28
1720; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40018
1721; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40014
1722; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
1723; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x4000c
1724; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
1725; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40004
1726; GFX9-NEXT:    s_and_b32 s2, s2, 15
1727; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1728; GFX9-NEXT:    v_mov_b32_e32 v2, s18
1729; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v1, v2
1730; GFX9-NEXT:    v_mov_b32_e32 v3, s17
1731; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
1732; GFX9-NEXT:    v_mad_u32_u24 v2, s11, v3, v2
1733; GFX9-NEXT:    v_mov_b32_e32 v3, s16
1734; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
1735; GFX9-NEXT:    v_mov_b32_e32 v3, s15
1736; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v3, v2
1737; GFX9-NEXT:    v_mov_b32_e32 v3, s14
1738; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
1739; GFX9-NEXT:    v_mov_b32_e32 v3, s13
1740; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v3, v2
1741; GFX9-NEXT:    v_mov_b32_e32 v3, s12
1742; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
1743; GFX9-NEXT:    v_mov_b32_e32 v3, s7
1744; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
1745; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1746; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
1747; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1748; GFX9-NEXT:    s_endpgm
1749;
1750; GFX9-DL-LABEL: udot8_multiuses_mul1:
1751; GFX9-DL:       ; %bb.0: ; %entry
1752; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1753; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1754; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1755; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1756; GFX9-DL-NEXT:    s_mov_b32 s22, -1
1757; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1758; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
1759; GFX9-DL-NEXT:    s_load_dword s18, s[0:1], 0x0
1760; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
1761; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
1762; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
1763; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
1764; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1765; GFX9-DL-NEXT:    s_bfe_u32 s17, s6, 0x40004
1766; GFX9-DL-NEXT:    s_lshr_b32 s7, s6, 28
1767; GFX9-DL-NEXT:    s_bfe_u32 s12, s6, 0x40018
1768; GFX9-DL-NEXT:    s_bfe_u32 s13, s6, 0x40014
1769; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40010
1770; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x4000c
1771; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x40008
1772; GFX9-DL-NEXT:    s_and_b32 s6, s6, 15
1773; GFX9-DL-NEXT:    s_lshr_b32 s3, s2, 28
1774; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40018
1775; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
1776; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
1777; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x4000c
1778; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40008
1779; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40004
1780; GFX9-DL-NEXT:    s_and_b32 s2, s2, 15
1781; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
1782; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s18
1783; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v1, v2
1784; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
1785; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
1786; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s11, v3, v2
1787; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s16
1788; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
1789; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s15
1790; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v3, v2
1791; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s14
1792; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
1793; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s13
1794; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v3, v2
1795; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s12
1796; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
1797; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s7
1798; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
1799; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1800; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
1801; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1802; GFX9-DL-NEXT:    s_endpgm
1803;
1804; GFX10-DL-LABEL: udot8_multiuses_mul1:
1805; GFX10-DL:       ; %bb.0: ; %entry
1806; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1807; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1808; GFX10-DL-NEXT:    s_mov_b32 s10, -1
1809; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
1810; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
1811; GFX10-DL-NEXT:    s_clause 0x1
1812; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1813; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1814; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
1815; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1816; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1817; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
1818; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1819; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1820; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1821; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
1822; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
1823; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
1824; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40004
1825; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40004
1826; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
1827; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s6, s7, v0
1828; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
1829; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40008
1830; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
1831; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s6, s7, v1
1832; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x4000c
1833; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
1834; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s6, s7, v1
1835; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
1836; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40010
1837; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s6, s7, v1
1838; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40014
1839; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40014
1840; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s6, s7, v1
1841; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40018
1842; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40018
1843; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
1844; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
1845; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s6, s7, v1
1846; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
1847; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, v0, v1
1848; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
1849; GFX10-DL-NEXT:    s_endpgm
1850                                                <8 x i4> addrspace(1)* %src2,
1851                                                i32 addrspace(1)* nocapture %dst) {
1852entry:
1853  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
1854  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
1855
1856  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1857  %cv1e0 = zext i4 %v1e0 to i32
1858  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1859  %cv2e0 = zext i4 %v2e0 to i32
1860  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
1861
1862  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1863  %cv1e1 = zext i4 %v1e1 to i32
1864  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1865  %cv2e1 = zext i4 %v2e1 to i32
1866  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
1867
1868  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1869  %cv1e2 = zext i4 %v1e2 to i32
1870  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1871  %cv2e2 = zext i4 %v2e2 to i32
1872  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
1873
1874  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1875  %cv1e3 = zext i4 %v1e3 to i32
1876  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1877  %cv2e3 = zext i4 %v2e3 to i32
1878  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
1879
1880  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1881  %cv1e4 = zext i4 %v1e4 to i32
1882  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1883  %cv2e4 = zext i4 %v2e4 to i32
1884  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
1885
1886  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1887  %cv1e5 = zext i4 %v1e5 to i32
1888  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1889  %cv2e5 = zext i4 %v2e5 to i32
1890  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
1891
1892  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1893  %cv1e6 = zext i4 %v1e6 to i32
1894  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1895  %cv2e6 = zext i4 %v2e6 to i32
1896  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
1897
1898  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1899  %cv1e7 = zext i4 %v1e7 to i32
1900  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1901  %cv2e7 = zext i4 %v2e7 to i32
1902  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
1903
1904  %acc = load i32, i32 addrspace(1)* %dst, align 4
1905  %add1 = add i32 %mul0, %acc
  ; Second use of %mul0: %add sits outside the %add1..%add8 chain.
1906  %add = add i32  %mul0, %add1
1907  %add2 = add i32 %add1, %mul1
1908  %add3 = add i32 %add2, %mul2
1909  %add4 = add i32 %add3, %mul3
1910  %add5 = add i32 %add4, %mul4
1911  %add6 = add i32 %add5, %mul5
1912  %add7 = add i32 %add6, %mul6
1913  %add8 = add i32 %add7, %mul7
1914
  ; The stored result combines the extra %mul0 use with the end of the chain.
1915  %res = add i32 %add, %add8
1916  store i32 %res, i32 addrspace(1)* %dst, align 4
1917  ret void
1918}
1919
; udot8_acc32_vecMul: eight-element unsigned i4 dot product accumulated into
; the i32 loaded from %dst. The operands are zero-extended and multiplied as
; whole <8 x i32> vectors, and the products are then extracted lane by lane.
; The assertions below expect a single v_dot8_u32_u4 for GFX9-DL and GFX10-DL,
; and v_mad_u32_u24 chains for GFX7, GFX8 and GFX9.
1920define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
1921; GFX7-LABEL: udot8_acc32_vecMul:
1922; GFX7:       ; %bb.0: ; %entry
1923; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1924; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1925; GFX7-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
1926; GFX7-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
1927; GFX7-NEXT:    s_mov_b32 s26, -1
1928; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1929; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
1930; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
1931; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1932; GFX7-NEXT:    s_mov_b32 s27, 0xe8f000
1933; GFX7-NEXT:    s_add_u32 s24, s24, s3
1934; GFX7-NEXT:    s_addc_u32 s25, s25, 0
1935; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1936; GFX7-NEXT:    s_lshr_b32 s7, s6, 28
1937; GFX7-NEXT:    s_bfe_u32 s14, s6, 0x40018
1938; GFX7-NEXT:    s_bfe_u32 s15, s6, 0x40014
1939; GFX7-NEXT:    s_bfe_u32 s16, s6, 0x40010
1940; GFX7-NEXT:    s_bfe_u32 s17, s6, 0x4000c
1941; GFX7-NEXT:    s_bfe_u32 s18, s6, 0x40008
1942; GFX7-NEXT:    s_bfe_u32 s19, s6, 0x40004
1943; GFX7-NEXT:    s_and_b32 s6, s6, 15
1944; GFX7-NEXT:    s_lshr_b32 s5, s4, 28
1945; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40018
1946; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40014
1947; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x40010
1948; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x4000c
1949; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40008
1950; GFX7-NEXT:    s_bfe_u32 s13, s4, 0x40004
1951; GFX7-NEXT:    s_and_b32 s4, s4, 15
1952; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1953; GFX7-NEXT:    v_mov_b32_e32 v1, s20
1954; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v0, v1
1955; GFX7-NEXT:    v_mov_b32_e32 v1, s19
1956; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
1957; GFX7-NEXT:    v_mov_b32_e32 v1, s18
1958; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
1959; GFX7-NEXT:    v_mov_b32_e32 v1, s17
1960; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
1961; GFX7-NEXT:    v_mov_b32_e32 v1, s16
1962; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
1963; GFX7-NEXT:    v_mov_b32_e32 v1, s15
1964; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
1965; GFX7-NEXT:    v_mov_b32_e32 v1, s14
1966; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
1967; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1968; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1969; GFX7-NEXT:    s_mov_b32 s2, -1
1970; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
1971; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1972; GFX7-NEXT:    s_endpgm
1973;
1974; GFX8-LABEL: udot8_acc32_vecMul:
1975; GFX8:       ; %bb.0: ; %entry
1976; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1977; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1978; GFX8-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1979; GFX8-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1980; GFX8-NEXT:    s_mov_b32 s22, -1
1981; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1982; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
1983; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
1984; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
1985; GFX8-NEXT:    s_mov_b32 s23, 0xe80000
1986; GFX8-NEXT:    s_add_u32 s20, s20, s3
1987; GFX8-NEXT:    s_addc_u32 s21, s21, 0
1988; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1989; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
1990; GFX8-NEXT:    s_bfe_u32 s12, s6, 0x40018
1991; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40014
1992; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40010
1993; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x4000c
1994; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x40008
1995; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40004
1996; GFX8-NEXT:    s_and_b32 s6, s6, 15
1997; GFX8-NEXT:    s_lshr_b32 s3, s2, 28
1998; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40018
1999; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40014
2000; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
2001; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x4000c
2002; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40008
2003; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40004
2004; GFX8-NEXT:    s_and_b32 s2, s2, 15
2005; GFX8-NEXT:    v_mov_b32_e32 v0, s6
2006; GFX8-NEXT:    v_mov_b32_e32 v1, s18
2007; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
2008; GFX8-NEXT:    v_mov_b32_e32 v1, s17
2009; GFX8-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
2010; GFX8-NEXT:    v_mov_b32_e32 v1, s16
2011; GFX8-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
2012; GFX8-NEXT:    v_mov_b32_e32 v1, s15
2013; GFX8-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
2014; GFX8-NEXT:    v_mov_b32_e32 v1, s14
2015; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
2016; GFX8-NEXT:    v_mov_b32_e32 v1, s13
2017; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
2018; GFX8-NEXT:    v_mov_b32_e32 v1, s12
2019; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
2020; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2021; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
2022; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2023; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2024; GFX8-NEXT:    flat_store_dword v[0:1], v2
2025; GFX8-NEXT:    s_endpgm
2026;
2027; GFX9-LABEL: udot8_acc32_vecMul:
2028; GFX9:       ; %bb.0: ; %entry
2029; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2030; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2031; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
2032; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
2033; GFX9-NEXT:    s_mov_b32 s22, -1
2034; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2035; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
2036; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
2037; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
2038; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
2039; GFX9-NEXT:    s_add_u32 s20, s20, s3
2040; GFX9-NEXT:    s_addc_u32 s21, s21, 0
2041; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2042; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
2043; GFX9-NEXT:    s_bfe_u32 s12, s6, 0x40018
2044; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40014
2045; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40010
2046; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x4000c
2047; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x40008
2048; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40004
2049; GFX9-NEXT:    s_and_b32 s6, s6, 15
2050; GFX9-NEXT:    s_lshr_b32 s3, s2, 28
2051; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40018
2052; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40014
2053; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
2054; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x4000c
2055; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
2056; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40004
2057; GFX9-NEXT:    s_and_b32 s2, s2, 15
2058; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2059; GFX9-NEXT:    v_mov_b32_e32 v2, s18
2060; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
2061; GFX9-NEXT:    v_mov_b32_e32 v2, s17
2062; GFX9-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
2063; GFX9-NEXT:    v_mov_b32_e32 v2, s16
2064; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
2065; GFX9-NEXT:    v_mov_b32_e32 v2, s15
2066; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
2067; GFX9-NEXT:    v_mov_b32_e32 v2, s14
2068; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
2069; GFX9-NEXT:    v_mov_b32_e32 v2, s13
2070; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
2071; GFX9-NEXT:    v_mov_b32_e32 v2, s12
2072; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
2073; GFX9-NEXT:    v_mov_b32_e32 v2, s7
2074; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2075; GFX9-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
2076; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2077; GFX9-NEXT:    s_endpgm
2078;
2079; GFX9-DL-LABEL: udot8_acc32_vecMul:
2080; GFX9-DL:       ; %bb.0: ; %entry
2081; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2082; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2083; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2084; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2085; GFX9-DL-NEXT:    s_mov_b32 s10, -1
2086; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
2087; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
2088; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2089; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
2090; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
2091; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
2092; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2093; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
2094; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2095; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
2096; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
2097; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, s4, v1, v2
2098; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
2099; GFX9-DL-NEXT:    s_endpgm
2100;
2101; GFX10-DL-LABEL: udot8_acc32_vecMul:
2102; GFX10-DL:       ; %bb.0: ; %entry
2103; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2104; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2105; GFX10-DL-NEXT:    s_mov_b32 s10, -1
2106; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
2107; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
2108; GFX10-DL-NEXT:    s_clause 0x1
2109; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2110; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2111; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
2112; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
2113; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2114; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
2115; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2116; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2117; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2118; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
2119; GFX10-DL-NEXT:    v_dot8_u32_u4 v0, s0, s1, v0
2120; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
2121; GFX10-DL-NEXT:    s_endpgm
2122                                              <8 x i4> addrspace(1)* %src2,
2123                                              i32 addrspace(1)* nocapture %dst) {
2124entry:
2125  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
2126  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
2127
  ; Widen both operand vectors to <8 x i32> before the multiply.
2128  %cvec1 = zext <8 x i4> %vec1 to <8 x i32>
2129  %cvec2 = zext <8 x i4> %vec2 to <8 x i32>
2130
  ; One vector multiply; the eight products are extracted individually below.
2131  %mul = mul <8 x i32> %cvec1, %cvec2
2132  %mul0 = extractelement <8 x i32> %mul, i64 0
2133  %mul1 = extractelement <8 x i32> %mul, i64 1
2134  %mul2 = extractelement <8 x i32> %mul, i64 2
2135  %mul3 = extractelement <8 x i32> %mul, i64 3
2136  %mul4 = extractelement <8 x i32> %mul, i64 4
2137  %mul5 = extractelement <8 x i32> %mul, i64 5
2138  %mul6 = extractelement <8 x i32> %mul, i64 6
2139  %mul7 = extractelement <8 x i32> %mul, i64 7
2140
  ; Add the eight products onto the value loaded from %dst, in lane order.
2141  %acc = load i32, i32 addrspace(1)* %dst, align 4
2142  %add1 = add i32 %mul0, %acc
2143  %add2 = add i32 %add1, %mul1
2144  %add3 = add i32 %add2, %mul2
2145  %add4 = add i32 %add3, %mul3
2146  %add5 = add i32 %add4, %mul4
2147  %add6 = add i32 %add5, %mul5
2148  %add7 = add i32 %add6, %mul6
2149  %add8 = add i32 %add7, %mul7
2150
2151  store i32 %add8, i32 addrspace(1)* %dst, align 4
2152  ret void
2153}
2154
2155; TODO: Clean up the code (by default pk_mad_I16 should be generated), then
2156; support the pattern.
2157define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
2158; GFX7-LABEL: udot8_acc16_vecMul:
2159; GFX7:       ; %bb.0: ; %entry
2160; GFX7-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
2161; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2162; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2163; GFX7-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
2164; GFX7-NEXT:    s_mov_b32 s22, -1
2165; GFX7-NEXT:    s_mov_b32 s23, 0xe8f000
2166; GFX7-NEXT:    s_add_u32 s20, s20, s3
2167; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2168; GFX7-NEXT:    s_mov_b32 s2, -1
2169; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2170; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
2171; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
2172; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
2173; GFX7-NEXT:    s_addc_u32 s21, s21, 0
2174; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2175; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
2176; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
2177; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
2178; GFX7-NEXT:    v_mov_b32_e32 v4, s17
2179; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
2180; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
2181; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
2182; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
2183; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
2184; GFX7-NEXT:    s_and_b32 s5, s5, 15
2185; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
2186; GFX7-NEXT:    v_mov_b32_e32 v2, s19
2187; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s12, v2
2188; GFX7-NEXT:    v_mul_u32_u24_e32 v4, s10, v4
2189; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
2190; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
2191; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
2192; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
2193; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
2194; GFX7-NEXT:    v_mov_b32_e32 v3, s18
2195; GFX7-NEXT:    s_and_b32 s4, s4, 15
2196; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2197; GFX7-NEXT:    v_mul_u32_u24_e32 v1, s4, v1
2198; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2199; GFX7-NEXT:    v_mul_u32_u24_e32 v3, s11, v3
2200; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
2201; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
2202; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
2203; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
2204; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
2205; GFX7-NEXT:    v_mov_b32_e32 v5, s16
2206; GFX7-NEXT:    v_mov_b32_e32 v6, s15
2207; GFX7-NEXT:    v_mov_b32_e32 v7, s14
2208; GFX7-NEXT:    s_waitcnt vmcnt(0)
2209; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2210; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
2211; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
2212; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
2213; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
2214; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
2215; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
2216; GFX7-NEXT:    v_mov_b32_e32 v1, s13
2217; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
2218; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
2219; GFX7-NEXT:    s_endpgm
2220;
2221; GFX8-LABEL: udot8_acc16_vecMul:
2222; GFX8:       ; %bb.0: ; %entry
2223; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2224; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2225; GFX8-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
2226; GFX8-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
2227; GFX8-NEXT:    s_mov_b32 s18, -1
2228; GFX8-NEXT:    s_mov_b32 s19, 0xe80000
2229; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2230; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2231; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2232; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
2233; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
2234; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
2235; GFX8-NEXT:    s_add_u32 s16, s16, s3
2236; GFX8-NEXT:    s_addc_u32 s17, s17, 0
2237; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2238; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
2239; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
2240; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
2241; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
2242; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x4000c
2243; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
2244; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
2245; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
2246; GFX8-NEXT:    s_and_b32 s1, s1, 15
2247; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
2248; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
2249; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
2250; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x4000c
2251; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
2252; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
2253; GFX8-NEXT:    s_and_b32 s0, s0, 15
2254; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2255; GFX8-NEXT:    v_mov_b32_e32 v4, s15
2256; GFX8-NEXT:    v_mov_b32_e32 v5, s14
2257; GFX8-NEXT:    v_mov_b32_e32 v6, s13
2258; GFX8-NEXT:    v_mov_b32_e32 v7, s12
2259; GFX8-NEXT:    v_mov_b32_e32 v8, s11
2260; GFX8-NEXT:    v_mov_b32_e32 v9, s10
2261; GFX8-NEXT:    s_waitcnt vmcnt(0)
2262; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
2263; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
2264; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2265; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
2266; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
2267; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
2268; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
2269; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
2270; GFX8-NEXT:    v_mov_b32_e32 v3, s9
2271; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
2272; GFX8-NEXT:    flat_store_short v[0:1], v2
2273; GFX8-NEXT:    s_endpgm
2274;
2275; GFX9-LABEL: udot8_acc16_vecMul:
2276; GFX9:       ; %bb.0: ; %entry
2277; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2278; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2279; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2280; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
2281; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
2282; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2283; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
2284; GFX9-NEXT:    global_load_ushort v5, v0, s[0:1]
2285; GFX9-NEXT:    s_mov_b32 s22, -1
2286; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
2287; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
2288; GFX9-NEXT:    s_add_u32 s20, s20, s3
2289; GFX9-NEXT:    s_addc_u32 s21, s21, 0
2290; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2291; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x40018
2292; GFX9-NEXT:    s_lshr_b32 s12, s6, 28
2293; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s12
2294; GFX9-NEXT:    s_bfe_u32 s3, s2, 0x40018
2295; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
2296; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40010
2297; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
2298; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
2299; GFX9-NEXT:    v_mov_b32_e32 v1, s7
2300; GFX9-NEXT:    v_pk_mul_lo_u16 v1, s3, v1
2301; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s13, s14
2302; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40008
2303; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
2304; GFX9-NEXT:    s_and_b32 s17, s6, 15
2305; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2306; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s15, s16
2307; GFX9-NEXT:    s_bfe_u32 s6, s6, 0x40004
2308; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2309; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s17, s6
2310; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40010
2311; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
2312; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
2313; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
2314; GFX9-NEXT:    s_and_b32 s11, s2, 15
2315; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40004
2316; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
2317; GFX9-NEXT:    v_mov_b32_e32 v4, s3
2318; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s8
2319; GFX9-NEXT:    v_pk_mul_lo_u16 v4, s2, v4
2320; GFX9-NEXT:    v_pk_mul_lo_u16 v2, s4, v2
2321; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s9, s10
2322; GFX9-NEXT:    v_pk_mul_lo_u16 v3, s4, v3
2323; GFX9-NEXT:    s_waitcnt vmcnt(0)
2324; GFX9-NEXT:    v_add_u32_e32 v5, v4, v5
2325; GFX9-NEXT:    v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2326; GFX9-NEXT:    v_add_u32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
2327; GFX9-NEXT:    v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2328; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
2329; GFX9-NEXT:    v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2330; GFX9-NEXT:    v_add_u32_e32 v2, v2, v1
2331; GFX9-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2332; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
2333; GFX9-NEXT:    s_endpgm
2334;
2335; GFX9-DL-LABEL: udot8_acc16_vecMul:
2336; GFX9-DL:       ; %bb.0: ; %entry
2337; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2338; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2339; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2340; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
2341; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
2342; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2343; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
2344; GFX9-DL-NEXT:    global_load_ushort v5, v0, s[0:1]
2345; GFX9-DL-NEXT:    s_mov_b32 s22, -1
2346; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
2347; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
2348; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
2349; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
2350; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2351; GFX9-DL-NEXT:    s_bfe_u32 s7, s6, 0x40018
2352; GFX9-DL-NEXT:    s_lshr_b32 s12, s6, 28
2353; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s7, s7, s12
2354; GFX9-DL-NEXT:    s_bfe_u32 s3, s2, 0x40018
2355; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
2356; GFX9-DL-NEXT:    s_bfe_u32 s13, s6, 0x40010
2357; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40014
2358; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
2359; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s7
2360; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, s3, v1
2361; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s13, s14
2362; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x40008
2363; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x4000c
2364; GFX9-DL-NEXT:    s_and_b32 s17, s6, 15
2365; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
2366; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s15, s16
2367; GFX9-DL-NEXT:    s_bfe_u32 s6, s6, 0x40004
2368; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
2369; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s17, s6
2370; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40010
2371; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
2372; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
2373; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x4000c
2374; GFX9-DL-NEXT:    s_and_b32 s11, s2, 15
2375; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40004
2376; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
2377; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s3
2378; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s5, s8
2379; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, s2, v4
2380; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, s4, v2
2381; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s9, s10
2382; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, s4, v3
2383; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2384; GFX9-DL-NEXT:    v_add_u32_e32 v5, v4, v5
2385; GFX9-DL-NEXT:    v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2386; GFX9-DL-NEXT:    v_add_u32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
2387; GFX9-DL-NEXT:    v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2388; GFX9-DL-NEXT:    v_add_u32_e32 v3, v3, v2
2389; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2390; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v1
2391; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2392; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
2393; GFX9-DL-NEXT:    s_endpgm
2394;
2395; GFX10-DL-LABEL: udot8_acc16_vecMul:
2396; GFX10-DL:       ; %bb.0: ; %entry
2397; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2398; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2399; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2400; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2401; GFX10-DL-NEXT:    s_mov_b32 s10, -1
2402; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
2403; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
2404; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2405; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
2406; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2407; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
2408; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2409; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2410; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2411; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
2412; GFX10-DL-NEXT:    s_bfe_u32 s7, s0, 0x40004
2413; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
2414; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40004
2415; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s7
2416; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
2417; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40008
2418; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, s2, s3
2419; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
2420; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
2421; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x4000c
2422; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
2423; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
2424; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x40014
2425; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s2, s6
2426; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
2427; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40010
2428; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40014
2429; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
2430; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
2431; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
2432; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
2433; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s3, s1
2434; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2435; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
2436; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2437; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, s2, s6
2438; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
2439; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
2440; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
2441; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s2, s0
2442; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2443; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s0, s1
2444; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2445; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2446; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v3
2447; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2448; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
2449; GFX10-DL-NEXT:    s_endpgm
2450                                              <8 x i4> addrspace(1)* %src2,
2451                                              i16 addrspace(1)* nocapture %dst) {
2452entry:
2453  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
2454  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
2455
2456  %cvec1 = zext <8 x i4> %vec1 to <8 x i16>
2457  %cvec2 = zext <8 x i4> %vec2 to <8 x i16>
2458
2459  %mul = mul <8 x i16> %cvec1, %cvec2
2460  %mul0 = extractelement <8 x i16> %mul, i64 0
2461  %mul1 = extractelement <8 x i16> %mul, i64 1
2462  %mul2 = extractelement <8 x i16> %mul, i64 2
2463  %mul3 = extractelement <8 x i16> %mul, i64 3
2464  %mul4 = extractelement <8 x i16> %mul, i64 4
2465  %mul5 = extractelement <8 x i16> %mul, i64 5
2466  %mul6 = extractelement <8 x i16> %mul, i64 6
2467  %mul7 = extractelement <8 x i16> %mul, i64 7
2468
2469  %acc = load i16, i16 addrspace(1)* %dst, align 4
2470  %add1 = add i16 %mul0, %acc
2471  %add2 = add i16 %add1, %mul1
2472  %add3 = add i16 %add2, %mul2
2473  %add4 = add i16 %add3, %mul3
2474  %add5 = add i16 %add4, %mul4
2475  %add6 = add i16 %add5, %mul5
2476  %add7 = add i16 %add6, %mul6
2477  %add8 = add i16 %add7, %mul7
2478
2479  store i16 %add8, i16 addrspace(1)* %dst, align 4
2480  ret void
2481}
2482
2483; TODO: Clean up the code to generate MAD; the pattern should then be recognized.
2484define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
2485; GFX7-LABEL: udot8_acc8_vecMul:
2486; GFX7:       ; %bb.0: ; %entry
2487; GFX7-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
2488; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2489; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2490; GFX7-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
2491; GFX7-NEXT:    s_mov_b32 s22, -1
2492; GFX7-NEXT:    s_mov_b32 s23, 0xe8f000
2493; GFX7-NEXT:    s_add_u32 s20, s20, s3
2494; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2495; GFX7-NEXT:    s_mov_b32 s2, -1
2496; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2497; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
2498; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
2499; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
2500; GFX7-NEXT:    s_addc_u32 s21, s21, 0
2501; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2502; GFX7-NEXT:    s_bfe_u32 s6, s4, 0x4000c
2503; GFX7-NEXT:    s_bfe_u32 s13, s5, 0x4000c
2504; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40004
2505; GFX7-NEXT:    s_lshr_b32 s17, s5, 28
2506; GFX7-NEXT:    v_mov_b32_e32 v8, s13
2507; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40008
2508; GFX7-NEXT:    s_and_b32 s16, s5, 15
2509; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40018
2510; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40014
2511; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40004
2512; GFX7-NEXT:    v_mov_b32_e32 v6, s15
2513; GFX7-NEXT:    s_lshr_b32 s10, s4, 28
2514; GFX7-NEXT:    v_mov_b32_e32 v4, s17
2515; GFX7-NEXT:    v_mul_u32_u24_e32 v4, s10, v4
2516; GFX7-NEXT:    v_mul_u32_u24_e32 v6, s8, v6
2517; GFX7-NEXT:    v_mul_u32_u24_e32 v8, s6, v8
2518; GFX7-NEXT:    s_bfe_u32 s5, s5, 0x40010
2519; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40008
2520; GFX7-NEXT:    v_mov_b32_e32 v7, s14
2521; GFX7-NEXT:    s_and_b32 s9, s4, 15
2522; GFX7-NEXT:    v_mov_b32_e32 v5, s16
2523; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40018
2524; GFX7-NEXT:    v_mov_b32_e32 v3, s18
2525; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40014
2526; GFX7-NEXT:    v_mov_b32_e32 v2, s19
2527; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s12, v2
2528; GFX7-NEXT:    s_bfe_u32 s4, s4, 0x40010
2529; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2530; GFX7-NEXT:    v_mul_u32_u24_e32 v3, s11, v3
2531; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
2532; GFX7-NEXT:    v_mul_u32_u24_e32 v5, s9, v5
2533; GFX7-NEXT:    v_mul_u32_u24_e32 v7, s7, v7
2534; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
2535; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
2536; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
2537; GFX7-NEXT:    v_or_b32_e32 v4, v5, v6
2538; GFX7-NEXT:    v_or_b32_e32 v5, v7, v8
2539; GFX7-NEXT:    v_mul_u32_u24_e32 v9, s4, v1
2540; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
2541; GFX7-NEXT:    v_or_b32_e32 v2, v9, v2
2542; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2543; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2544; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
2545; GFX7-NEXT:    v_or_b32_e32 v3, v4, v5
2546; GFX7-NEXT:    v_alignbit_b32 v4, v2, v3, 8
2547; GFX7-NEXT:    v_alignbit_b32 v5, v2, v3, 16
2548; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
2549; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
2550; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
2551; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
2552; GFX7-NEXT:    s_waitcnt vmcnt(0)
2553; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
2554; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
2555; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
2556; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
2557; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
2558; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
2559; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
2560; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2561; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2562; GFX7-NEXT:    s_endpgm
2563;
2564; GFX8-LABEL: udot8_acc8_vecMul:
2565; GFX8:       ; %bb.0: ; %entry
2566; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2567; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2568; GFX8-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
2569; GFX8-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
2570; GFX8-NEXT:    s_mov_b32 s22, -1
2571; GFX8-NEXT:    s_mov_b32 s23, 0xe80000
2572; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2573; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2574; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2575; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
2576; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
2577; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
2578; GFX8-NEXT:    s_add_u32 s20, s20, s3
2579; GFX8-NEXT:    s_addc_u32 s21, s21, 0
2580; GFX8-NEXT:    s_mov_b32 s0, 0xffff
2581; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2582; GFX8-NEXT:    s_bfe_u32 s7, s1, 0x40004
2583; GFX8-NEXT:    s_bfe_u32 s9, s1, 0x4000c
2584; GFX8-NEXT:    s_bfe_u32 s14, s2, 0x40004
2585; GFX8-NEXT:    s_and_b32 s15, s2, 15
2586; GFX8-NEXT:    s_bfe_u32 s16, s2, 0x4000c
2587; GFX8-NEXT:    s_bfe_u32 s3, s1, 0x40014
2588; GFX8-NEXT:    s_lshr_b32 s5, s1, 28
2589; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40014
2590; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40010
2591; GFX8-NEXT:    s_lshr_b32 s12, s2, 28
2592; GFX8-NEXT:    s_bfe_u32 s13, s2, 0x40018
2593; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x40008
2594; GFX8-NEXT:    s_and_b32 s8, s1, 15
2595; GFX8-NEXT:    v_mov_b32_e32 v4, s16
2596; GFX8-NEXT:    v_mov_b32_e32 v5, s9
2597; GFX8-NEXT:    v_mov_b32_e32 v6, s15
2598; GFX8-NEXT:    v_mov_b32_e32 v7, s14
2599; GFX8-NEXT:    v_mov_b32_e32 v8, s7
2600; GFX8-NEXT:    v_mul_u32_u24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2601; GFX8-NEXT:    v_mul_u32_u24_e32 v5, s8, v6
2602; GFX8-NEXT:    v_mul_u32_u24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2603; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x40010
2604; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x40018
2605; GFX8-NEXT:    v_mov_b32_e32 v9, s13
2606; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x40008
2607; GFX8-NEXT:    v_mov_b32_e32 v3, s2
2608; GFX8-NEXT:    v_mov_b32_e32 v10, s12
2609; GFX8-NEXT:    v_mov_b32_e32 v11, s5
2610; GFX8-NEXT:    v_mov_b32_e32 v12, s11
2611; GFX8-NEXT:    v_mov_b32_e32 v13, s10
2612; GFX8-NEXT:    v_mov_b32_e32 v14, s3
2613; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s1, v3
2614; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
2615; GFX8-NEXT:    v_mul_u32_u24_e32 v7, s6, v9
2616; GFX8-NEXT:    v_mul_u32_u24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2617; GFX8-NEXT:    v_mul_u32_u24_e32 v9, s4, v12
2618; GFX8-NEXT:    v_mul_u32_u24_sdwa v10, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2619; GFX8-NEXT:    v_and_b32_e32 v5, s0, v5
2620; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2621; GFX8-NEXT:    v_or_b32_e32 v9, v9, v10
2622; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2623; GFX8-NEXT:    v_and_b32_e32 v4, s0, v9
2624; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
2625; GFX8-NEXT:    v_or_b32_e32 v6, v4, v7
2626; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
2627; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v6
2628; GFX8-NEXT:    s_waitcnt vmcnt(0)
2629; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
2630; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
2631; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
2632; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2633; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2634; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
2635; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2636; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2637; GFX8-NEXT:    flat_store_byte v[0:1], v2
2638; GFX8-NEXT:    s_endpgm
2639;
2640; GFX9-LABEL: udot8_acc8_vecMul:
2641; GFX9:       ; %bb.0: ; %entry
2642; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2643; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2644; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2645; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
2646; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
2647; GFX9-NEXT:    s_mov_b32 s22, -1
2648; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2649; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
2650; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
2651; GFX9-NEXT:    s_add_u32 s20, s20, s3
2652; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
2653; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
2654; GFX9-NEXT:    s_addc_u32 s21, s21, 0
2655; GFX9-NEXT:    s_mov_b32 s2, 0xffff
2656; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2657; GFX9-NEXT:    s_bfe_u32 s5, s3, 0x40010
2658; GFX9-NEXT:    s_bfe_u32 s12, s4, 0x40010
2659; GFX9-NEXT:    s_bfe_u32 s13, s4, 0x40014
2660; GFX9-NEXT:    s_bfe_u32 s14, s4, 0x40018
2661; GFX9-NEXT:    s_lshr_b32 s15, s4, 28
2662; GFX9-NEXT:    s_and_b32 s16, s4, 15
2663; GFX9-NEXT:    s_bfe_u32 s17, s4, 0x40004
2664; GFX9-NEXT:    s_bfe_u32 s18, s4, 0x40008
2665; GFX9-NEXT:    v_mov_b32_e32 v2, s12
2666; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x4000c
2667; GFX9-NEXT:    s_bfe_u32 s6, s3, 0x40014
2668; GFX9-NEXT:    v_mov_b32_e32 v3, s13
2669; GFX9-NEXT:    s_bfe_u32 s7, s3, 0x40018
2670; GFX9-NEXT:    v_mov_b32_e32 v4, s14
2671; GFX9-NEXT:    s_lshr_b32 s8, s3, 28
2672; GFX9-NEXT:    v_mov_b32_e32 v5, s15
2673; GFX9-NEXT:    s_and_b32 s9, s3, 15
2674; GFX9-NEXT:    v_mov_b32_e32 v6, s16
2675; GFX9-NEXT:    s_bfe_u32 s10, s3, 0x40004
2676; GFX9-NEXT:    v_mov_b32_e32 v7, s17
2677; GFX9-NEXT:    s_bfe_u32 s11, s3, 0x40008
2678; GFX9-NEXT:    v_mov_b32_e32 v8, s18
2679; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x4000c
2680; GFX9-NEXT:    v_mov_b32_e32 v9, s4
2681; GFX9-NEXT:    v_mul_lo_u16_e32 v2, s5, v2
2682; GFX9-NEXT:    v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2683; GFX9-NEXT:    v_mul_lo_u16_e32 v4, s7, v4
2684; GFX9-NEXT:    v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2685; GFX9-NEXT:    v_mul_lo_u16_e32 v6, s9, v6
2686; GFX9-NEXT:    v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2687; GFX9-NEXT:    v_or_b32_e32 v2, v2, v3
2688; GFX9-NEXT:    v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2689; GFX9-NEXT:    v_or_b32_e32 v4, v6, v7
2690; GFX9-NEXT:    v_mul_lo_u16_e32 v8, s11, v8
2691; GFX9-NEXT:    v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2692; GFX9-NEXT:    v_and_b32_e32 v4, s2, v4
2693; GFX9-NEXT:    v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2694; GFX9-NEXT:    v_or_b32_e32 v5, v4, v5
2695; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v5
2696; GFX9-NEXT:    v_and_b32_e32 v2, s2, v2
2697; GFX9-NEXT:    v_or_b32_e32 v3, v2, v3
2698; GFX9-NEXT:    s_waitcnt vmcnt(0)
2699; GFX9-NEXT:    v_add_u32_e32 v1, v4, v1
2700; GFX9-NEXT:    v_add_u32_e32 v1, v1, v6
2701; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
2702; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2703; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
2704; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
2705; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
2706; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2707; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2708; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
2709; GFX9-NEXT:    s_endpgm
2710;
2711; GFX9-DL-LABEL: udot8_acc8_vecMul:
2712; GFX9-DL:       ; %bb.0: ; %entry
2713; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2714; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2715; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2716; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
2717; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
2718; GFX9-DL-NEXT:    s_mov_b32 s22, -1
2719; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2720; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
2721; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
2722; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
2723; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
2724; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
2725; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
2726; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
2727; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2728; GFX9-DL-NEXT:    s_bfe_u32 s5, s3, 0x40010
2729; GFX9-DL-NEXT:    s_bfe_u32 s12, s4, 0x40010
2730; GFX9-DL-NEXT:    s_bfe_u32 s13, s4, 0x40014
2731; GFX9-DL-NEXT:    s_bfe_u32 s14, s4, 0x40018
2732; GFX9-DL-NEXT:    s_lshr_b32 s15, s4, 28
2733; GFX9-DL-NEXT:    s_and_b32 s16, s4, 15
2734; GFX9-DL-NEXT:    s_bfe_u32 s17, s4, 0x40004
2735; GFX9-DL-NEXT:    s_bfe_u32 s18, s4, 0x40008
2736; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s12
2737; GFX9-DL-NEXT:    s_bfe_u32 s4, s4, 0x4000c
2738; GFX9-DL-NEXT:    s_bfe_u32 s6, s3, 0x40014
2739; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s13
2740; GFX9-DL-NEXT:    s_bfe_u32 s7, s3, 0x40018
2741; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s14
2742; GFX9-DL-NEXT:    s_lshr_b32 s8, s3, 28
2743; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
2744; GFX9-DL-NEXT:    s_and_b32 s9, s3, 15
2745; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s16
2746; GFX9-DL-NEXT:    s_bfe_u32 s10, s3, 0x40004
2747; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s17
2748; GFX9-DL-NEXT:    s_bfe_u32 s11, s3, 0x40008
2749; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s18
2750; GFX9-DL-NEXT:    s_bfe_u32 s3, s3, 0x4000c
2751; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s4
2752; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v2, s5, v2
2753; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2754; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v4, s7, v4
2755; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2756; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v6, s9, v6
2757; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2758; GFX9-DL-NEXT:    v_or_b32_e32 v2, v2, v3
2759; GFX9-DL-NEXT:    v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2760; GFX9-DL-NEXT:    v_or_b32_e32 v4, v6, v7
2761; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v8, s11, v8
2762; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2763; GFX9-DL-NEXT:    v_and_b32_e32 v4, s2, v4
2764; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2765; GFX9-DL-NEXT:    v_or_b32_e32 v5, v4, v5
2766; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v5
2767; GFX9-DL-NEXT:    v_and_b32_e32 v2, s2, v2
2768; GFX9-DL-NEXT:    v_or_b32_e32 v3, v2, v3
2769; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2770; GFX9-DL-NEXT:    v_add_u32_e32 v1, v4, v1
2771; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v6
2772; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
2773; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2774; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
2775; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
2776; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
2777; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2778; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2779; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
2780; GFX9-DL-NEXT:    s_endpgm
2781;
2782; GFX10-DL-LABEL: udot8_acc8_vecMul:
2783; GFX10-DL:       ; %bb.0: ; %entry
2784; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2785; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2786; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2787; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2788; GFX10-DL-NEXT:    s_mov_b32 s14, -1
2789; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
2790; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
2791; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2792; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
2793; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2794; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
2795; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2796; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2797; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2798; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x40004
2799; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40004
2800; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
2801; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v2, s3, s7
2802; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
2803; GFX10-DL-NEXT:    s_bfe_u32 s8, s0, 0x4000c
2804; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
2805; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, s2, s3
2806; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s8, s7
2807; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 8, v2
2808; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
2809; GFX10-DL-NEXT:    s_bfe_u32 s2, s1, 0x40008
2810; GFX10-DL-NEXT:    s_mov_b32 s3, 0xffff
2811; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s6, s2
2812; GFX10-DL-NEXT:    v_or_b32_e32 v2, v3, v2
2813; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 8, v4
2814; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40014
2815; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40014
2816; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
2817; GFX10-DL-NEXT:    v_and_b32_e32 v2, s3, v2
2818; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2819; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s6, s8
2820; GFX10-DL-NEXT:    s_bfe_u32 s7, s0, 0x40018
2821; GFX10-DL-NEXT:    s_bfe_u32 s9, s1, 0x40010
2822; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
2823; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v3
2824; GFX10-DL-NEXT:    s_lshr_b32 s6, s1, 28
2825; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s2, s9
2826; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s0, s6
2827; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 8, v4
2828; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
2829; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40018
2830; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v11, s7, s0
2831; GFX10-DL-NEXT:    v_or_b32_e32 v4, v5, v4
2832; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 8, v6
2833; GFX10-DL-NEXT:    v_and_b32_e32 v4, s3, v4
2834; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2835; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
2836; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2837; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v7
2838; GFX10-DL-NEXT:    v_or_b32_e32 v2, v4, v2
2839; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
2840; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2841; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
2842; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v4
2843; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v3
2844; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2845; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2846; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
2847; GFX10-DL-NEXT:    s_endpgm
2848                                             <8 x i4> addrspace(1)* %src2,
2849                                             i8 addrspace(1)* nocapture %dst) {
2850entry:
2851  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
2852  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
2853
2854  %cvec1 = zext <8 x i4> %vec1 to <8 x i8>
2855  %cvec2 = zext <8 x i4> %vec2 to <8 x i8>
2856
2857  %mul = mul <8 x i8> %cvec1, %cvec2
2858  %mul0 = extractelement <8 x i8> %mul, i64 0
2859  %mul1 = extractelement <8 x i8> %mul, i64 1
2860  %mul2 = extractelement <8 x i8> %mul, i64 2
2861  %mul3 = extractelement <8 x i8> %mul, i64 3
2862  %mul4 = extractelement <8 x i8> %mul, i64 4
2863  %mul5 = extractelement <8 x i8> %mul, i64 5
2864  %mul6 = extractelement <8 x i8> %mul, i64 6
2865  %mul7 = extractelement <8 x i8> %mul, i64 7
2866
2867  %acc = load i8, i8 addrspace(1)* %dst, align 4
2868  %add1 = add i8 %mul0, %acc
2869  %add2 = add i8 %add1, %mul1
2870  %add3 = add i8 %add2, %mul2
2871  %add4 = add i8 %add3, %mul3
2872  %add5 = add i8 %add4, %mul4
2873  %add6 = add i8 %add5, %mul5
2874  %add7 = add i8 %add6, %mul6
2875  %add8 = add i8 %add7, %mul7
2876
2877  store i8 %add8, i8 addrspace(1)* %dst, align 4
2878  ret void
2879}
2880
; TODO: Once the additional "and+add" are removed, the pattern will be recognized.
; udot8_acc4_vecMul: multiplies two <8 x i4> vectors lane by lane at i4 width,
; then adds the eight products one at a time onto an i4 accumulator loaded
; from %dst and stores the i4 sum back to %dst.
; In the assertions below, every run line expands this into
; v_mul_u32_u24 / v_mad_u32_u24 sequences masked with 15; none of them contains
; a v_dot8_u32_u4 instruction.
define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: udot8_acc4_vecMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_mov_b32 s22, -1
; GFX7-NEXT:    s_mov_b32 s23, 0xe8f000
; GFX7-NEXT:    s_add_u32 s20, s20, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_addc_u32 s21, s21, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
; GFX7-NEXT:    s_and_b32 s5, s5, 15
; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
; GFX7-NEXT:    s_and_b32 s4, s4, 15
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v2, s19
; GFX7-NEXT:    v_mov_b32_e32 v3, s18
; GFX7-NEXT:    v_mov_b32_e32 v4, s17
; GFX7-NEXT:    v_mov_b32_e32 v5, s16
; GFX7-NEXT:    v_mov_b32_e32 v6, s15
; GFX7-NEXT:    v_mov_b32_e32 v7, s14
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s13
; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot8_acc4_vecMul:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s18, -1
; GFX8-NEXT:    s_mov_b32 s19, 0xe80000
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT:    s_add_u32 s16, s16, s3
; GFX8-NEXT:    s_addc_u32 s17, s17, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s8, s0, 15
; GFX8-NEXT:    s_and_b32 s15, s1, 15
; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40004
; GFX8-NEXT:    v_mov_b32_e32 v4, s15
; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40008
; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x4000c
; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40004
; GFX8-NEXT:    v_mov_b32_e32 v5, s14
; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40008
; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x4000c
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v6, s13
; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
; GFX8-NEXT:    v_mov_b32_e32 v7, s12
; GFX8-NEXT:    v_mov_b32_e32 v8, s11
; GFX8-NEXT:    v_mov_b32_e32 v9, s10
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
; GFX8-NEXT:    v_mov_b32_e32 v3, s9
; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
; GFX8-NEXT:    flat_store_byte v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: udot8_acc4_vecMul:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s22, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
; GFX9-NEXT:    s_add_u32 s20, s20, s3
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT:    s_addc_u32 s21, s21, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s10, s2, 15
; GFX9-NEXT:    s_and_b32 s17, s3, 15
; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40004
; GFX9-NEXT:    v_mov_b32_e32 v3, s17
; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40018
; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40014
; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40010
; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x40008
; GFX9-NEXT:    s_lshr_b32 s11, s3, 28
; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x4000c
; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40004
; GFX9-NEXT:    v_mov_b32_e32 v4, s16
; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40008
; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x4000c
; GFX9-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NEXT:    v_mov_b32_e32 v5, s15
; GFX9-NEXT:    v_mul_u32_u24_e32 v2, s2, v2
; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
; GFX9-NEXT:    v_mov_b32_e32 v6, s14
; GFX9-NEXT:    v_mov_b32_e32 v7, s13
; GFX9-NEXT:    v_mov_b32_e32 v8, s12
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
; GFX9-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s11
; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot8_acc4_vecMul:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s22, -1
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_and_b32 s10, s2, 15
; GFX9-DL-NEXT:    s_and_b32 s17, s3, 15
; GFX9-DL-NEXT:    s_bfe_u32 s16, s3, 0x40004
; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
; GFX9-DL-NEXT:    s_bfe_u32 s12, s3, 0x40018
; GFX9-DL-NEXT:    s_bfe_u32 s13, s3, 0x40014
; GFX9-DL-NEXT:    s_bfe_u32 s14, s3, 0x40010
; GFX9-DL-NEXT:    s_bfe_u32 s15, s3, 0x40008
; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 28
; GFX9-DL-NEXT:    s_bfe_u32 s3, s3, 0x4000c
; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40004
; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40008
; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x4000c
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, s2, v2
; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot8_acc4_vecMul:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40008
; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40004
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x4000c
; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v2, s3, s7
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s6, v1
; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40014
; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40014
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                             <8 x i4> addrspace(1)* %src2,
                                             i4 addrspace(1)* nocapture %dst) {
entry:
  ; Both operands are loaded as whole <8 x i4> vectors.
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2

  ; The multiply is done directly on i4 lanes (no zext/sext beforehand), and
  ; each lane of the product is then pulled out individually.
  %mul = mul <8 x i4> %vec1, %vec2
  %mul0 = extractelement <8 x i4> %mul, i64 0
  %mul1 = extractelement <8 x i4> %mul, i64 1
  %mul2 = extractelement <8 x i4> %mul, i64 2
  %mul3 = extractelement <8 x i4> %mul, i64 3
  %mul4 = extractelement <8 x i4> %mul, i64 4
  %mul5 = extractelement <8 x i4> %mul, i64 5
  %mul6 = extractelement <8 x i4> %mul, i64 6
  %mul7 = extractelement <8 x i4> %mul, i64 7

  ; The accumulator is the i4 currently stored at %dst; the products are added
  ; onto it in lane order 0..7 with plain (wrapping) i4 adds.
  %acc = load i4, i4 addrspace(1)* %dst, align 4
  %add1 = add i4 %mul0, %acc
  %add2 = add i4 %add1, %mul1
  %add3 = add i4 %add2, %mul2
  %add4 = add i4 %add3, %mul3
  %add5 = add i4 %add4, %mul4
  %add6 = add i4 %add5, %mul5
  %add7 = add i4 %add6, %mul6
  %add8 = add i4 %add7, %mul7

  ; The final i4 sum overwrites the accumulator location.
  store i4 %add8, i4 addrspace(1)* %dst, align 4
  ret void
}
3182
; udot8_variant1: treats the two loaded i32 values as eight 4-bit fields each
; (lshr by 0, 4, ..., 28 followed by an and with 15), multiplies matching fields
; as i32, and adds all eight products to the i32 loaded from %dst before
; storing the sum back to %dst.
; The products are summed in the order mul1, mul8, mul2, ..., mul7 rather than
; in field order. In the assertions below, the gfx906 and gfx1011/gfx1012 runs
; contain a single v_dot8_u32_u4; the other runs contain a v_mad_u32_u24 chain.
define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
; GFX7-LABEL: udot8_variant1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s6, s4, 15
; GFX7-NEXT:    s_and_b32 s7, s5, 15
; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40004
; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x40008
; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x4000c
; GFX7-NEXT:    s_bfe_u32 s14, s4, 0x40010
; GFX7-NEXT:    s_bfe_u32 s16, s4, 0x40014
; GFX7-NEXT:    s_bfe_u32 s18, s4, 0x40018
; GFX7-NEXT:    s_lshr_b32 s4, s4, 28
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s20
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x40004
; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x40008
; GFX7-NEXT:    s_bfe_u32 s13, s5, 0x4000c
; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40010
; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x40014
; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40018
; GFX7-NEXT:    s_lshr_b32 s5, s5, 28
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s8
; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s10
; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s12
; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s14
; GFX7-NEXT:    v_mad_u32_u24 v0, s15, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s16
; GFX7-NEXT:    v_mad_u32_u24 v0, s17, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s18
; GFX7-NEXT:    v_mad_u32_u24 v0, s19, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot8_variant1:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s4, s2, 15
; GFX8-NEXT:    s_and_b32 s5, s3, 15
; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40004
; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40008
; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x4000c
; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x40010
; GFX8-NEXT:    s_bfe_u32 s14, s2, 0x40014
; GFX8-NEXT:    s_bfe_u32 s16, s2, 0x40018
; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s18
; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
; GFX8-NEXT:    s_bfe_u32 s7, s3, 0x40004
; GFX8-NEXT:    s_bfe_u32 s9, s3, 0x40008
; GFX8-NEXT:    s_bfe_u32 s11, s3, 0x4000c
; GFX8-NEXT:    s_bfe_u32 s13, s3, 0x40010
; GFX8-NEXT:    s_bfe_u32 s15, s3, 0x40014
; GFX8-NEXT:    s_bfe_u32 s17, s3, 0x40018
; GFX8-NEXT:    s_lshr_b32 s3, s3, 28
; GFX8-NEXT:    v_mov_b32_e32 v1, s2
; GFX8-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s6
; GFX8-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s8
; GFX8-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s10
; GFX8-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s12
; GFX8-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s14
; GFX8-NEXT:    v_mad_u32_u24 v0, s15, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s16
; GFX8-NEXT:    v_mad_u32_u24 v2, s17, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: udot8_variant1:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s4, s2, 15
; GFX9-NEXT:    s_and_b32 s5, s3, 15
; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40004
; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40008
; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
; GFX9-NEXT:    s_bfe_u32 s12, s2, 0x40010
; GFX9-NEXT:    s_bfe_u32 s14, s2, 0x40014
; GFX9-NEXT:    s_bfe_u32 s16, s2, 0x40018
; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v2, s18
; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
; GFX9-NEXT:    s_bfe_u32 s7, s3, 0x40004
; GFX9-NEXT:    s_bfe_u32 s9, s3, 0x40008
; GFX9-NEXT:    s_bfe_u32 s11, s3, 0x4000c
; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40010
; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x40014
; GFX9-NEXT:    s_bfe_u32 s17, s3, 0x40018
; GFX9-NEXT:    s_lshr_b32 s3, s3, 28
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s8
; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s10
; GFX9-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s12
; GFX9-NEXT:    v_mad_u32_u24 v1, s13, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s14
; GFX9-NEXT:    v_mad_u32_u24 v1, s15, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s16
; GFX9-NEXT:    v_mad_u32_u24 v1, s17, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot8_variant1:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, s4, v1, v2
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot8_variant1:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT:    v_dot8_u32_u4 v0, s1, s0, v0
; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                          i32 addrspace(1)* %v2addr,
                                          i32 addrspace(1)* %dst) {
entry:
  ; Each operand is a single i32 holding eight packed 4-bit fields.
  %v1 = load i32, i32 addrspace(1)* %v1addr, align 4
  %v2 = load i32, i32 addrspace(1)* %v2addr, align 4
  ; Field 0 (bits 3:0): no shift needed, just the mask.
  %and = and i32 %v1, 15
  %and1 = and i32 %v2, 15
  %mul1 = mul nuw nsw i32 %and1, %and

  ; Field 1 (bits 7:4).
  %shr = lshr i32 %v1, 4
  %and2 = and i32 %shr, 15
  %shr3 = lshr i32 %v2, 4
  %and4 = and i32 %shr3, 15
  %mul2 = mul nuw nsw i32 %and4, %and2

  ; Field 2 (bits 11:8).
  %shr6 = lshr i32 %v1, 8
  %and7 = and i32 %shr6, 15
  %shr8 = lshr i32 %v2, 8
  %and9 = and i32 %shr8, 15
  %mul3 = mul nuw nsw i32 %and9, %and7

  ; Field 3 (bits 15:12).
  %shr12 = lshr i32 %v1, 12
  %and13 = and i32 %shr12, 15
  %shr14 = lshr i32 %v2, 12
  %and15 = and i32 %shr14, 15
  %mul4 = mul nuw nsw i32 %and15, %and13

  ; Field 4 (bits 19:16).
  %shr18 = lshr i32 %v1, 16
  %and19 = and i32 %shr18, 15
  %shr20 = lshr i32 %v2, 16
  %and21 = and i32 %shr20, 15
  %mul5 = mul nuw nsw i32 %and21, %and19

  ; Field 5 (bits 23:20).
  %shr24 = lshr i32 %v1, 20
  %and25 = and i32 %shr24, 15
  %shr26 = lshr i32 %v2, 20
  %and27 = and i32 %shr26, 15
  %mul6 = mul nuw nsw i32 %and27, %and25

  ; Field 6 (bits 27:24).
  %shr30 = lshr i32 %v1, 24
  %and31 = and i32 %shr30, 15
  %shr32 = lshr i32 %v2, 24
  %and33 = and i32 %shr32, 15
  %mul7 = mul nuw nsw i32 %and33, %and31

  ; Field 7 (bits 31:28): a logical shift right by 28 already leaves only four
  ; bits, so no mask is applied.
  %shr36 = lshr i32 %v1, 28
  %shr37 = lshr i32 %v2, 28
  %mul8 = mul nuw nsw i32 %shr37, %shr36
  ; The accumulator is the i32 currently stored at %dst.
  %acc = load i32, i32 addrspace(1)* %dst, align 4

  ; Accumulation order is deliberately not field order: the top-field product
  ; (%mul8) is added second, right after %mul1, then %mul2 through %mul7 follow.
  %add1 = add i32 %mul1, %acc
  %add2 = add i32 %add1, %mul8
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7
  store i32 %add8, i32 addrspace(1)* %dst, align 4
  ret void
}
3416