1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
8
; udot4_acc32 - four-lane unsigned 8-bit dot product with a 32-bit accumulator.
;
; Arguments:
;   %src1, %src2 - global (addrspace 1) pointers to the <4 x i8> operands.
;   %dst         - global pointer to the i32 accumulator; it is read first and
;                  then overwritten with accumulator + sum of the four products.
;
; Every lane is zero-extended to i32 before the multiply. The expected output
; below uses a single v_dot4_u32_u8 on the gfx906, gfx1011 and gfx1012 runs and
; a chain of four v_mad_u32_u24 on the gfx700, gfx803 and gfx900 runs.
9define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
10; GFX7-LABEL: udot4_acc32:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
13; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
14; GFX7-NEXT:    s_movk_i32 s8, 0xff
15; GFX7-NEXT:    s_mov_b32 s3, 0xf000
16; GFX7-NEXT:    s_mov_b32 s2, -1
17; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
19; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
20; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
21; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
22; GFX7-NEXT:    s_and_b32 s6, s4, s8
23; GFX7-NEXT:    s_and_b32 s7, s5, s8
24; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
25; GFX7-NEXT:    v_mov_b32_e32 v0, s7
26; GFX7-NEXT:    v_mov_b32_e32 v1, s12
27; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
28; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
29; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
30; GFX7-NEXT:    v_mov_b32_e32 v1, s9
31; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
32; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
33; GFX7-NEXT:    v_mov_b32_e32 v1, s11
34; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
35; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
36; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
37; GFX7-NEXT:    v_mov_b32_e32 v1, s5
38; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
39; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
40; GFX7-NEXT:    s_endpgm
41;
42; GFX8-LABEL: udot4_acc32:
43; GFX8:       ; %bb.0: ; %entry
44; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
45; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
46; GFX8-NEXT:    s_movk_i32 s2, 0xff
47; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
48; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
49; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
50; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
51; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
52; GFX8-NEXT:    s_and_b32 s5, s3, s2
53; GFX8-NEXT:    s_and_b32 s2, s4, s2
54; GFX8-NEXT:    s_bfe_u32 s7, s4, 0x80008
55; GFX8-NEXT:    v_mov_b32_e32 v0, s2
56; GFX8-NEXT:    v_mov_b32_e32 v1, s10
57; GFX8-NEXT:    s_bfe_u32 s9, s4, 0x80010
58; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
59; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x80008
60; GFX8-NEXT:    v_mov_b32_e32 v1, s7
61; GFX8-NEXT:    s_bfe_u32 s8, s3, 0x80010
62; GFX8-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
63; GFX8-NEXT:    v_mov_b32_e32 v1, s9
64; GFX8-NEXT:    s_lshr_b32 s4, s4, 24
65; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
66; GFX8-NEXT:    s_lshr_b32 s3, s3, 24
67; GFX8-NEXT:    v_mov_b32_e32 v1, s4
68; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
69; GFX8-NEXT:    v_mov_b32_e32 v0, s0
70; GFX8-NEXT:    v_mov_b32_e32 v1, s1
71; GFX8-NEXT:    flat_store_dword v[0:1], v2
72; GFX8-NEXT:    s_endpgm
73;
74; GFX9-NODL-LABEL: udot4_acc32:
75; GFX9-NODL:       ; %bb.0: ; %entry
76; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
77; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
78; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
79; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
80; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
82; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
83; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
84; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
85; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
86; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
87; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
88; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
89; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
90; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
91; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
92; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
93; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
94; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
95; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
96; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
97; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
98; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
99; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
100; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
101; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
102; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
103; GFX9-NODL-NEXT:    s_endpgm
104;
105; GFX9-DL-LABEL: udot4_acc32:
106; GFX9-DL:       ; %bb.0: ; %entry
107; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
108; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
109; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
110; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
112; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
113; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
114; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
115; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
116; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
117; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s4, v1, v2
118; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
119; GFX9-DL-NEXT:    s_endpgm
120;
121; GFX10-DL-LABEL: udot4_acc32:
122; GFX10-DL:       ; %bb.0: ; %entry
123; GFX10-DL-NEXT:    s_clause 0x1
124; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
125; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
126; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
127; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
129; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
130; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
131; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
132; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
133; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, s0, s1, v0
134; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
135; GFX10-DL-NEXT:    s_endpgm
136                                       <4 x i8> addrspace(1)* %src2,
137                                       i32 addrspace(1)* nocapture %dst) {
138entry:
  ; Load both packed operands as whole <4 x i8> vectors.
139  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
140  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
141
  ; Lane 0 - extract, zero-extend to i32, multiply. Lanes 1-3 repeat the pattern.
142  %v1e0 = extractelement <4 x i8> %vec1, i64 0
143  %cv1e0 = zext i8 %v1e0 to i32
144  %v2e0 = extractelement <4 x i8> %vec2, i64 0
145  %cv2e0 = zext i8 %v2e0 to i32
146  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
147
148  %v1e1 = extractelement <4 x i8> %vec1, i64 1
149  %cv1e1 = zext i8 %v1e1 to i32
150  %v2e1 = extractelement <4 x i8> %vec2, i64 1
151  %cv2e1 = zext i8 %v2e1 to i32
152  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
153
154  %v1e2 = extractelement <4 x i8> %vec1, i64 2
155  %cv1e2 = zext i8 %v1e2 to i32
156  %v2e2 = extractelement <4 x i8> %vec2, i64 2
157  %cv2e2 = zext i8 %v2e2 to i32
158  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
159
160  %v1e3 = extractelement <4 x i8> %vec1, i64 3
161  %cv1e3 = zext i8 %v1e3 to i32
162  %v2e3 = extractelement <4 x i8> %vec2, i64 3
163  %cv2e3 = zext i8 %v2e3 to i32
164  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
165
  ; The accumulator is read from %dst and the four products are chained onto it
  ; one add at a time, in lane order.
166  %acc = load i32, i32 addrspace(1)* %dst, align 4
167  %mad1 = add i32 %mul1, %acc
168  %mad2 = add i32 %mad1, %mul2
169  %mad3 = add i32 %mad2, %mul3
170  %mad4 = add i32 %mad3, %mul4
171
  ; Write the updated accumulator back to the address it was read from.
172  store i32 %mad4, i32 addrspace(1)* %dst, align 4
173  ret void
174}
175
; udot4_acc16 - four-lane unsigned 8-bit dot product with a 16-bit accumulator.
;
; Arguments:
;   %src1, %src2 - global (addrspace 1) pointers to the <4 x i8> operands.
;   %dst         - global pointer to the i16 accumulator; it is read first and
;                  then overwritten with accumulator + sum of the four products.
;
; Lanes are zero-extended to i16 only, and all arithmetic is done in i16. The
; expected output still uses v_dot4_u32_u8 on the gfx906, gfx1011 and gfx1012
; runs, with the accumulator brought in by a ushort load and the result written
; with a short store. The other runs expect four v_mad_u32_u24.
176define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
177; GFX7-LABEL: udot4_acc16:
178; GFX7:       ; %bb.0: ; %entry
179; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
180; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
181; GFX7-NEXT:    s_mov_b32 s3, 0xf000
182; GFX7-NEXT:    s_mov_b32 s2, -1
183; GFX7-NEXT:    s_movk_i32 s8, 0xff
184; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
185; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
186; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
187; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
188; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
189; GFX7-NEXT:    s_and_b32 s7, s4, s8
190; GFX7-NEXT:    s_and_b32 s6, s5, s8
191; GFX7-NEXT:    s_bfe_u32 s8, s5, 0x80008
192; GFX7-NEXT:    v_mov_b32_e32 v1, s6
193; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80010
194; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
195; GFX7-NEXT:    v_mov_b32_e32 v2, s8
196; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
197; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
198; GFX7-NEXT:    v_mov_b32_e32 v3, s10
199; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
200; GFX7-NEXT:    s_waitcnt vmcnt(0)
201; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
202; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
203; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
204; GFX7-NEXT:    v_mov_b32_e32 v1, s5
205; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
206; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
207; GFX7-NEXT:    s_endpgm
208;
209; GFX8-LABEL: udot4_acc16:
210; GFX8:       ; %bb.0: ; %entry
211; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
212; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
213; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
214; GFX8-NEXT:    v_mov_b32_e32 v0, s0
215; GFX8-NEXT:    v_mov_b32_e32 v1, s1
216; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
217; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
218; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
219; GFX8-NEXT:    s_movk_i32 s0, 0xff
220; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
221; GFX8-NEXT:    s_and_b32 s3, s1, s0
222; GFX8-NEXT:    s_and_b32 s0, s2, s0
223; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
224; GFX8-NEXT:    v_mov_b32_e32 v3, s0
225; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
226; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
227; GFX8-NEXT:    v_mov_b32_e32 v4, s5
228; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
229; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
230; GFX8-NEXT:    v_mov_b32_e32 v5, s7
231; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
232; GFX8-NEXT:    s_waitcnt vmcnt(0)
233; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
234; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v4, v2
235; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
236; GFX8-NEXT:    v_mov_b32_e32 v3, s2
237; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
238; GFX8-NEXT:    flat_store_short v[0:1], v2
239; GFX8-NEXT:    s_endpgm
240;
241; GFX9-NODL-LABEL: udot4_acc16:
242; GFX9-NODL:       ; %bb.0: ; %entry
243; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
244; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
245; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
246; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
247; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
248; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[0:1]
249; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
250; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
251; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
252; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
253; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
254; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
255; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
256; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
257; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
258; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s7
259; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
260; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
261; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s9
262; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
263; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
264; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
265; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v3, v1
266; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s8, v4, v1
267; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
268; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
269; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
270; GFX9-NODL-NEXT:    s_endpgm
271;
272; GFX9-DL-LABEL: udot4_acc16:
273; GFX9-DL:       ; %bb.0: ; %entry
274; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
275; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
276; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
277; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
278; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
279; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
280; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
281; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
282; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
283; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
284; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s2, v2, v1
285; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
286; GFX9-DL-NEXT:    s_endpgm
287;
288; GFX10-DL-LABEL: udot4_acc16:
289; GFX10-DL:       ; %bb.0: ; %entry
290; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
291; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
292; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
293; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
295; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
296; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
297; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
298; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, s0, s1, v1
299; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
300; GFX10-DL-NEXT:    s_endpgm
301                                       <4 x i8> addrspace(1)* %src2,
302                                       i16 addrspace(1)* nocapture %dst) {
303entry:
  ; Load both packed operands as whole <4 x i8> vectors.
304  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
305  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
306
  ; Lane 0 - extract, zero-extend to i16, multiply. Lanes 1-3 repeat the pattern.
307  %v1e0 = extractelement <4 x i8> %vec1, i64 0
308  %cv1e0 = zext i8 %v1e0 to i16
309  %v2e0 = extractelement <4 x i8> %vec2, i64 0
310  %cv2e0 = zext i8 %v2e0 to i16
311  %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
312
313  %v1e1 = extractelement <4 x i8> %vec1, i64 1
314  %cv1e1 = zext i8 %v1e1 to i16
315  %v2e1 = extractelement <4 x i8> %vec2, i64 1
316  %cv2e1 = zext i8 %v2e1 to i16
317  %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
318
319  %v1e2 = extractelement <4 x i8> %vec1, i64 2
320  %cv1e2 = zext i8 %v1e2 to i16
321  %v2e2 = extractelement <4 x i8> %vec2, i64 2
322  %cv2e2 = zext i8 %v2e2 to i16
323  %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
324
325  %v1e3 = extractelement <4 x i8> %vec1, i64 3
326  %cv1e3 = zext i8 %v1e3 to i16
327  %v2e3 = extractelement <4 x i8> %vec2, i64 3
328  %cv2e3 = zext i8 %v2e3 to i16
329  %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
330
  ; The i16 accumulator is read from %dst and the products are chained onto it
  ; one add at a time, in lane order.
331  %acc = load i16, i16 addrspace(1)* %dst, align 2
332  %mad1 = add i16 %mul1, %acc
333  %mad2 = add i16 %mad1, %mul2
334  %mad3 = add i16 %mad2, %mul3
335  %mad4 = add i16 %mad3, %mul4
336
  ; Write the updated accumulator back to the address it was read from.
337  store i16 %mad4, i16 addrspace(1)* %dst, align 2
338  ret void
339}
340
; udot4_acc8 - four-lane unsigned 8-bit dot product with an 8-bit accumulator.
;
; Arguments:
;   %src1, %src2 - global (addrspace 1) pointers to the <4 x i8> operands.
;   %dst         - global pointer to the i8 accumulator; it is read first and
;                  then overwritten with accumulator + sum of the four products.
;
; Unlike the wider-accumulator variants in this file there is no zext in the
; IR; the lanes are multiplied and summed directly in i8. The expected output
; uses v_dot4_u32_u8 on the gfx906, gfx1011 and gfx1012 runs, with a ubyte load
; for the accumulator and a byte store for the result. The other runs expect
; four v_mad_u32_u24.
341define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
342; GFX7-LABEL: udot4_acc8:
343; GFX7:       ; %bb.0: ; %entry
344; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
345; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
346; GFX7-NEXT:    s_mov_b32 s3, 0xf000
347; GFX7-NEXT:    s_mov_b32 s2, -1
348; GFX7-NEXT:    s_movk_i32 s8, 0xff
349; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
350; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
351; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
352; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
353; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
354; GFX7-NEXT:    s_and_b32 s7, s4, s8
355; GFX7-NEXT:    s_and_b32 s6, s5, s8
356; GFX7-NEXT:    s_bfe_u32 s8, s5, 0x80008
357; GFX7-NEXT:    v_mov_b32_e32 v1, s6
358; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80010
359; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
360; GFX7-NEXT:    v_mov_b32_e32 v2, s8
361; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
362; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
363; GFX7-NEXT:    v_mov_b32_e32 v3, s10
364; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
365; GFX7-NEXT:    s_waitcnt vmcnt(0)
366; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
367; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
368; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
369; GFX7-NEXT:    v_mov_b32_e32 v1, s5
370; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
371; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
372; GFX7-NEXT:    s_endpgm
373;
374; GFX8-LABEL: udot4_acc8:
375; GFX8:       ; %bb.0: ; %entry
376; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
377; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
378; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
379; GFX8-NEXT:    v_mov_b32_e32 v0, s0
380; GFX8-NEXT:    v_mov_b32_e32 v1, s1
381; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
382; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
383; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
384; GFX8-NEXT:    s_movk_i32 s0, 0xff
385; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80008
387; GFX8-NEXT:    s_and_b32 s3, s2, s0
388; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x80008
389; GFX8-NEXT:    s_and_b32 s0, s1, s0
390; GFX8-NEXT:    v_mov_b32_e32 v3, s3
391; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x80010
392; GFX8-NEXT:    v_mov_b32_e32 v4, s4
393; GFX8-NEXT:    s_bfe_u32 s7, s1, 0x80010
394; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
395; GFX8-NEXT:    v_mov_b32_e32 v5, s6
396; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
397; GFX8-NEXT:    s_waitcnt vmcnt(0)
398; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
399; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
400; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
401; GFX8-NEXT:    v_mov_b32_e32 v3, s2
402; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
403; GFX8-NEXT:    flat_store_byte v[0:1], v2
404; GFX8-NEXT:    s_endpgm
405;
406; GFX9-NODL-LABEL: udot4_acc8:
407; GFX9-NODL:       ; %bb.0: ; %entry
408; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
409; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
410; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
411; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
412; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
413; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
414; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
415; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
416; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
417; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80008
418; GFX9-NODL-NEXT:    s_and_b32 s5, s4, s2
419; GFX9-NODL-NEXT:    s_bfe_u32 s6, s4, 0x80008
420; GFX9-NODL-NEXT:    s_and_b32 s2, s3, s2
421; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
422; GFX9-NODL-NEXT:    s_bfe_u32 s8, s4, 0x80010
423; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
424; GFX9-NODL-NEXT:    s_bfe_u32 s9, s3, 0x80010
425; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
426; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
427; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
428; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
429; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
430; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v3, v1
431; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
432; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
433; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
434; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
435; GFX9-NODL-NEXT:    s_endpgm
436;
437; GFX9-DL-LABEL: udot4_acc8:
438; GFX9-DL:       ; %bb.0: ; %entry
439; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
440; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
441; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
442; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
443; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
444; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
445; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
446; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
448; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
449; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s2, v2, v1
450; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
451; GFX9-DL-NEXT:    s_endpgm
452;
453; GFX10-DL-LABEL: udot4_acc8:
454; GFX10-DL:       ; %bb.0: ; %entry
455; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
456; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
457; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
458; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
459; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
460; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
461; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
462; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
463; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, s0, s1, v1
464; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
465; GFX10-DL-NEXT:    s_endpgm
466                                      <4 x i8> addrspace(1)* %src2,
467                                      i8 addrspace(1)* nocapture %dst) {
468entry:
  ; Load both packed operands as whole <4 x i8> vectors.
469  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
470  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
471
  ; Per lane - extract from each vector and multiply directly in i8 (no zext).
472  %v1e0 = extractelement <4 x i8> %vec1, i64 0
473  %v2e0 = extractelement <4 x i8> %vec2, i64 0
474  %mul1 = mul nuw nsw i8 %v1e0, %v2e0
475
476  %v1e1 = extractelement <4 x i8> %vec1, i64 1
477  %v2e1 = extractelement <4 x i8> %vec2, i64 1
478  %mul2 = mul nuw nsw i8 %v1e1, %v2e1
479
480  %v1e2 = extractelement <4 x i8> %vec1, i64 2
481  %v2e2 = extractelement <4 x i8> %vec2, i64 2
482  %mul3 = mul nuw nsw i8 %v1e2, %v2e2
483
484  %v1e3 = extractelement <4 x i8> %vec1, i64 3
485  %v2e3 = extractelement <4 x i8> %vec2, i64 3
486  %mul4 = mul nuw nsw i8 %v1e3, %v2e3
487
  ; The i8 accumulator is read from %dst and the products are chained onto it
  ; one add at a time, in lane order.
488  %acc = load i8, i8 addrspace(1)* %dst, align 2
489  %mad1 = add i8 %mul1, %acc
490  %mad2 = add i8 %mad1, %mul2
491  %mad3 = add i8 %mad2, %mul3
492  %mad4 = add i8 %mad3, %mul4
493
  ; Write the updated accumulator back to the address it was read from.
494  store i8 %mad4, i8 addrspace(1)* %dst, align 2
495  ret void
496}
497
498; TODO: Generate udot4?
; udot2_8 - two-lane variant of the 8-bit dot product with an 8-bit accumulator.
;
; Arguments:
;   %src1, %src2 - global (addrspace 1) pointers to <4 x i8> operands, of which
;                  only lanes 0 and 1 are used.
;   %dst         - global pointer to the i8 accumulator; it is read first and
;                  then overwritten with accumulator + the two products.
;
; No dot-product instruction is expected here on any run line; every check
; group below consists of two v_mad_u32_u24 instructions.
499define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
500; GFX7-LABEL: udot2_8:
501; GFX7:       ; %bb.0: ; %entry
502; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
503; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
504; GFX7-NEXT:    s_mov_b32 s3, 0xf000
505; GFX7-NEXT:    s_mov_b32 s2, -1
506; GFX7-NEXT:    s_movk_i32 s8, 0xff
507; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
508; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
509; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
510; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
511; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
512; GFX7-NEXT:    s_and_b32 s7, s4, s8
513; GFX7-NEXT:    s_and_b32 s6, s5, s8
514; GFX7-NEXT:    v_mov_b32_e32 v1, s6
515; GFX7-NEXT:    s_bfe_u32 s5, s5, 0x80008
516; GFX7-NEXT:    s_bfe_u32 s4, s4, 0x80008
517; GFX7-NEXT:    s_waitcnt vmcnt(0)
518; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
519; GFX7-NEXT:    v_mov_b32_e32 v1, s5
520; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
521; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
522; GFX7-NEXT:    s_endpgm
523;
524; GFX8-LABEL: udot2_8:
525; GFX8:       ; %bb.0: ; %entry
526; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
527; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
528; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
529; GFX8-NEXT:    v_mov_b32_e32 v0, s0
530; GFX8-NEXT:    v_mov_b32_e32 v1, s1
531; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
532; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
533; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
534; GFX8-NEXT:    s_movk_i32 s0, 0xff
535; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
536; GFX8-NEXT:    s_and_b32 s3, s2, s0
537; GFX8-NEXT:    s_and_b32 s0, s1, s0
538; GFX8-NEXT:    v_mov_b32_e32 v3, s3
539; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x80008
540; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80008
541; GFX8-NEXT:    s_waitcnt vmcnt(0)
542; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
543; GFX8-NEXT:    v_mov_b32_e32 v3, s2
544; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
545; GFX8-NEXT:    flat_store_byte v[0:1], v2
546; GFX8-NEXT:    s_endpgm
547;
548; GFX9-NODL-LABEL: udot2_8:
549; GFX9-NODL:       ; %bb.0: ; %entry
550; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
551; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
552; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
553; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
554; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
555; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
556; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
557; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
558; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
559; GFX9-NODL-NEXT:    s_and_b32 s5, s4, s2
560; GFX9-NODL-NEXT:    s_and_b32 s2, s3, s2
561; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
562; GFX9-NODL-NEXT:    s_bfe_u32 s4, s4, 0x80008
563; GFX9-NODL-NEXT:    s_bfe_u32 s3, s3, 0x80008
564; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
565; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
566; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
567; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
568; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
569; GFX9-NODL-NEXT:    s_endpgm
570;
571; GFX9-DL-LABEL: udot2_8:
572; GFX9-DL:       ; %bb.0: ; %entry
573; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
574; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
575; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
576; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
577; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
578; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
579; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
580; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
581; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
582; GFX9-DL-NEXT:    s_and_b32 s5, s4, s2
583; GFX9-DL-NEXT:    s_and_b32 s2, s3, s2
584; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
585; GFX9-DL-NEXT:    s_bfe_u32 s4, s4, 0x80008
586; GFX9-DL-NEXT:    s_bfe_u32 s3, s3, 0x80008
587; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
588; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
589; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
590; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
591; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
592; GFX9-DL-NEXT:    s_endpgm
593;
594; GFX10-DL-LABEL: udot2_8:
595; GFX10-DL:       ; %bb.0: ; %entry
596; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
597; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
598; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
599; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
600; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
601; GFX10-DL-NEXT:    s_load_dword s2, s[2:3], 0x0
602; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
603; GFX10-DL-NEXT:    s_movk_i32 s1, 0xff
604; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX10-DL-NEXT:    s_and_b32 s3, s2, s1
606; GFX10-DL-NEXT:    s_and_b32 s1, s0, s1
607; GFX10-DL-NEXT:    s_bfe_u32 s0, s0, 0x80008
608; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
609; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s1, s3, v1
610; GFX10-DL-NEXT:    s_bfe_u32 s1, s2, 0x80008
611; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
612; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
613; GFX10-DL-NEXT:    s_endpgm
614                                   <4 x i8> addrspace(1)* %src2,
615                                   i8 addrspace(1)* nocapture %dst) {
616entry:
  ; Full <4 x i8> vectors are loaded even though only two lanes are consumed.
617  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
618  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
619
  ; Lanes 0 and 1 only; lanes 2 and 3 are never extracted.
620  %v1e0 = extractelement <4 x i8> %vec1, i64 0
621  %v2e0 = extractelement <4 x i8> %vec2, i64 0
622  %mul1 = mul nuw nsw i8 %v1e0, %v2e0
623
624  %v1e1 = extractelement <4 x i8> %vec1, i64 1
625  %v2e1 = extractelement <4 x i8> %vec2, i64 1
626  %mul2 = mul nuw nsw i8 %v1e1, %v2e1
627
  ; Accumulate both products onto the i8 value read from %dst and store it back.
628  %acc = load i8, i8 addrspace(1)* %dst, align 2
629  %mad1 = add i8 %mul1, %acc
630  %mad2 = add i8 %mad1, %mul2
631  store i8 %mad2, i8 addrspace(1)* %dst, align 2
632  ret void
633}
634
; udot4_CommutationInsideMAD - 8-bit dot product with commuted operands inside
; each multiply-add.
;
; Arguments:
;   %src1, %src2 - global (addrspace 1) pointers to the <4 x i8> operands.
;   %dst         - global pointer to the i8 accumulator; it is read first and
;                  then overwritten with accumulator + sum of the four products.
;
; Same computation as udot4_acc8, but every mul takes the %vec2 lane as its
; first operand and the adds take their operands in the opposite order. The
; expected output still uses v_dot4_u32_u8 on the gfx906, gfx1011 and gfx1012
; runs; its two packed sources appear in the opposite order from udot4_acc8.
635define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %src1,
636; GFX7-LABEL: udot4_CommutationInsideMAD:
637; GFX7:       ; %bb.0: ; %entry
638; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
639; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
640; GFX7-NEXT:    s_mov_b32 s3, 0xf000
641; GFX7-NEXT:    s_mov_b32 s2, -1
642; GFX7-NEXT:    s_movk_i32 s8, 0xff
643; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
644; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
645; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
646; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
647; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
648; GFX7-NEXT:    s_and_b32 s6, s4, s8
649; GFX7-NEXT:    s_and_b32 s7, s5, s8
650; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
651; GFX7-NEXT:    v_mov_b32_e32 v1, s6
652; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
653; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
654; GFX7-NEXT:    v_mov_b32_e32 v2, s8
655; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
656; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
657; GFX7-NEXT:    v_mov_b32_e32 v3, s10
658; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
659; GFX7-NEXT:    s_waitcnt vmcnt(0)
660; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
661; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
662; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
663; GFX7-NEXT:    v_mov_b32_e32 v1, s4
664; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
665; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
666; GFX7-NEXT:    s_endpgm
667;
668; GFX8-LABEL: udot4_CommutationInsideMAD:
669; GFX8:       ; %bb.0: ; %entry
670; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
671; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
672; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
673; GFX8-NEXT:    v_mov_b32_e32 v0, s0
674; GFX8-NEXT:    v_mov_b32_e32 v1, s1
675; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
676; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
677; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
678; GFX8-NEXT:    s_movk_i32 s0, 0xff
679; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
680; GFX8-NEXT:    s_and_b32 s3, s1, s0
681; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
682; GFX8-NEXT:    s_and_b32 s0, s2, s0
683; GFX8-NEXT:    v_mov_b32_e32 v3, s3
684; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
685; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
686; GFX8-NEXT:    v_mov_b32_e32 v4, s4
687; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
688; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
689; GFX8-NEXT:    v_mov_b32_e32 v5, s6
690; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
691; GFX8-NEXT:    s_waitcnt vmcnt(0)
692; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
693; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
694; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
695; GFX8-NEXT:    v_mov_b32_e32 v3, s1
696; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
697; GFX8-NEXT:    flat_store_byte v[0:1], v2
698; GFX8-NEXT:    s_endpgm
699;
700; GFX9-NODL-LABEL: udot4_CommutationInsideMAD:
701; GFX9-NODL:       ; %bb.0: ; %entry
702; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
703; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
704; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
705; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
706; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
707; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
708; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
709; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
710; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
711; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
712; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
713; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
714; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
715; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
716; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
717; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
718; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
719; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
720; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
721; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
722; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
723; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
724; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v3, v1
725; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
726; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
727; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
728; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
729; GFX9-NODL-NEXT:    s_endpgm
730;
731; GFX9-DL-LABEL: udot4_CommutationInsideMAD:
732; GFX9-DL:       ; %bb.0: ; %entry
733; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
734; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
735; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
736; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
737; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
738; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
739; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
740; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
741; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
742; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
743; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s3, v2, v1
744; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
745; GFX9-DL-NEXT:    s_endpgm
746;
747; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
748; GFX10-DL:       ; %bb.0: ; %entry
749; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
750; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
751; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
752; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
754; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
755; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
756; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
757; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, s1, s0, v1
758; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
759; GFX10-DL-NEXT:    s_endpgm
760                                                      <4 x i8> addrspace(1)* %src2,
761                                                      i8 addrspace(1)* nocapture %dst) {
762entry:
  ; Load both packed operands as whole <4 x i8> vectors.
763  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
764  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
765
  ; Each mul below deliberately lists the %vec2 lane before the %vec1 lane.
766  %v1e0 = extractelement <4 x i8> %vec1, i64 0
767  %v2e0 = extractelement <4 x i8> %vec2, i64 0
768  %mul1 = mul nuw nsw i8 %v2e0, %v1e0
769
770  %v1e1 = extractelement <4 x i8> %vec1, i64 1
771  %v2e1 = extractelement <4 x i8> %vec2, i64 1
772  %mul2 = mul nuw nsw i8 %v2e1, %v1e1
773
774  %v1e2 = extractelement <4 x i8> %vec1, i64 2
775  %v2e2 = extractelement <4 x i8> %vec2, i64 2
776  %mul3 = mul nuw nsw i8 %v2e2, %v1e2
777
778  %v1e3 = extractelement <4 x i8> %vec1, i64 3
779  %v2e3 = extractelement <4 x i8> %vec2, i64 3
780  %mul4 = mul nuw nsw i8 %v2e3, %v1e3
781
  ; The add chain is commuted as well - the first add lists the accumulator
  ; first, and each later add lists the new product before the running sum.
782  %acc = load i8, i8 addrspace(1)* %dst, align 2
783  %mad1 = add i8 %acc, %mul1
784  %mad2 = add i8 %mul2, %mad1
785  %mad3 = add i8 %mul3, %mad2
786  %mad4 = add i8 %mul4, %mad3
787
  ; Write the updated accumulator back to the address it was read from.
788  store i8 %mad4, i8 addrspace(1)* %dst, align 2
789  ret void
790}
791
792; TODO: Support commutation across the adds.
; Unsigned 4 x i8 dot product accumulated into an i8 loaded from %dst, where
; the add chain consumes the products out of lane order (%mul2 is added
; before %mul1). The assertions below show every target, including the
; dot-capable ones, still emitting four v_mad_u32_u24 instead of a single
; v_dot4_u32_u8.
793define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %src1,
794; GFX7-LABEL: udot4_CommutationAccrossMADs:
795; GFX7:       ; %bb.0: ; %entry
796; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
797; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
798; GFX7-NEXT:    s_mov_b32 s3, 0xf000
799; GFX7-NEXT:    s_mov_b32 s2, -1
800; GFX7-NEXT:    s_movk_i32 s8, 0xff
801; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
802; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
803; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
804; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
805; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
806; GFX7-NEXT:    s_and_b32 s6, s4, s8
807; GFX7-NEXT:    s_and_b32 s7, s5, s8
808; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
809; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
810; GFX7-NEXT:    v_mov_b32_e32 v1, s8
811; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
812; GFX7-NEXT:    v_mov_b32_e32 v2, s6
813; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
814; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
815; GFX7-NEXT:    v_mov_b32_e32 v3, s10
816; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
817; GFX7-NEXT:    s_waitcnt vmcnt(0)
818; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
819; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v2, v0
820; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
821; GFX7-NEXT:    v_mov_b32_e32 v1, s4
822; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
823; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
824; GFX7-NEXT:    s_endpgm
825;
826; GFX8-LABEL: udot4_CommutationAccrossMADs:
827; GFX8:       ; %bb.0: ; %entry
828; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
829; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
830; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
831; GFX8-NEXT:    v_mov_b32_e32 v0, s0
832; GFX8-NEXT:    v_mov_b32_e32 v1, s1
833; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
834; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
835; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
836; GFX8-NEXT:    s_movk_i32 s0, 0xff
837; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
838; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
839; GFX8-NEXT:    s_and_b32 s3, s1, s0
840; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
841; GFX8-NEXT:    v_mov_b32_e32 v3, s4
842; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
843; GFX8-NEXT:    s_and_b32 s0, s2, s0
844; GFX8-NEXT:    v_mov_b32_e32 v4, s3
845; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
846; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
847; GFX8-NEXT:    v_mov_b32_e32 v5, s6
848; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
849; GFX8-NEXT:    s_waitcnt vmcnt(0)
850; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v3, v2
851; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v4, v2
852; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
853; GFX8-NEXT:    v_mov_b32_e32 v3, s1
854; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
855; GFX8-NEXT:    flat_store_byte v[0:1], v2
856; GFX8-NEXT:    s_endpgm
857;
858; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs:
859; GFX9-NODL:       ; %bb.0: ; %entry
860; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
861; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
862; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
863; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
864; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
865; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
866; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
867; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
868; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
869; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
870; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
871; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
872; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
873; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
874; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
875; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s5
876; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
877; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
878; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
879; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
880; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
881; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
882; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v3, v1
883; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
884; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
885; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
886; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
887; GFX9-NODL-NEXT:    s_endpgm
888;
889; GFX9-DL-LABEL: udot4_CommutationAccrossMADs:
890; GFX9-DL:       ; %bb.0: ; %entry
891; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
892; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
893; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
894; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
895; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
896; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
897; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
898; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
899; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
900; GFX9-DL-NEXT:    s_bfe_u32 s6, s3, 0x80008
901; GFX9-DL-NEXT:    s_and_b32 s5, s3, s2
902; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x80008
903; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
904; GFX9-DL-NEXT:    s_bfe_u32 s8, s3, 0x80010
905; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
906; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
907; GFX9-DL-NEXT:    s_bfe_u32 s9, s4, 0x80010
908; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 24
909; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s8
910; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 24
911; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
912; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
913; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v3, v1
914; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
915; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
916; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
917; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
918; GFX9-DL-NEXT:    s_endpgm
919;
920; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
921; GFX10-DL:       ; %bb.0: ; %entry
922; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
923; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
924; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
925; GFX10-DL-NEXT:    s_movk_i32 s6, 0xff
926; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
927; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
928; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
929; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
930; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
931; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80008
932; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80008
933; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
934; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s3, s2, v1
935; GFX10-DL-NEXT:    s_and_b32 s2, s0, s6
936; GFX10-DL-NEXT:    s_and_b32 s3, s1, s6
937; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s3, s2, v1
938; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80010
939; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80010
940; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
941; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
942; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s3, s2, v1
943; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s1, s0, v1
944; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
945; GFX10-DL-NEXT:    s_endpgm
946                                                        <4 x i8> addrspace(1)* %src2,
947                                                        i8 addrspace(1)* nocapture %dst) {
948entry:
  ; Load both packed 4 x i8 operands.
949  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
950  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
951
  ; Per-lane i8 products; each multiply takes the %vec2 element as its first
  ; operand and the %vec1 element as its second.
952  %v1e0 = extractelement <4 x i8> %vec1, i64 0
953  %v2e0 = extractelement <4 x i8> %vec2, i64 0
954  %mul1 = mul nuw nsw i8 %v2e0, %v1e0
955
956  %v1e1 = extractelement <4 x i8> %vec1, i64 1
957  %v2e1 = extractelement <4 x i8> %vec2, i64 1
958  %mul2 = mul nuw nsw i8 %v2e1, %v1e1
959
960  %v1e2 = extractelement <4 x i8> %vec1, i64 2
961  %v2e2 = extractelement <4 x i8> %vec2, i64 2
962  %mul3 = mul nuw nsw i8 %v2e2, %v1e2
963
964  %v1e3 = extractelement <4 x i8> %vec1, i64 3
965  %v2e3 = extractelement <4 x i8> %vec2, i64 3
966  %mul4 = mul nuw nsw i8 %v2e3, %v1e3
967
  ; Accumulate onto the i8 read from %dst. The lane-1 product (%mul2) is
  ; added before the lane-0 product (%mul1); lanes 2 and 3 follow in order.
968  %acc = load i8, i8 addrspace(1)* %dst, align 2
969  %mad1 = add i8 %acc, %mul2
970  %mad2 = add i8 %mad1, %mul1
971  %mad3 = add i8 %mad2, %mul3
972  %mad4 = add i8 %mad3, %mul4
973
  ; Write the final sum back to %dst.
974  store i8 %mad4, i8 addrspace(1)* %dst, align 2
975  ret void
976}
977
; Zero-extended 4 x i8 dot product accumulated into an i32 loaded from %dst,
; where the lane-0 product (%mul1) is consumed twice in the add chain. Per the
; assertions below, all targets keep the chain as five v_mad_u32_u24 and none
; folds it into v_dot4_u32_u8.
978define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
979; GFX7-LABEL: udot4_multiuse_mul1:
980; GFX7:       ; %bb.0: ; %entry
981; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
982; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
983; GFX7-NEXT:    s_movk_i32 s8, 0xff
984; GFX7-NEXT:    s_mov_b32 s3, 0xf000
985; GFX7-NEXT:    s_mov_b32 s2, -1
986; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
987; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
988; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
989; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
990; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
991; GFX7-NEXT:    s_and_b32 s6, s4, s8
992; GFX7-NEXT:    s_and_b32 s7, s5, s8
993; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
994; GFX7-NEXT:    v_mov_b32_e32 v0, s7
995; GFX7-NEXT:    v_mov_b32_e32 v1, s12
996; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
997; GFX7-NEXT:    v_mad_u32_u24 v1, s6, v0, v1
998; GFX7-NEXT:    v_mov_b32_e32 v2, s9
999; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
1000; GFX7-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
1001; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
1002; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
1003; GFX7-NEXT:    v_mov_b32_e32 v1, s11
1004; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
1005; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
1006; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
1007; GFX7-NEXT:    v_mov_b32_e32 v1, s5
1008; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
1009; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1010; GFX7-NEXT:    s_endpgm
1011;
1012; GFX8-LABEL: udot4_multiuse_mul1:
1013; GFX8:       ; %bb.0: ; %entry
1014; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1015; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1016; GFX8-NEXT:    s_movk_i32 s2, 0xff
1017; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1018; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
1019; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
1020; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
1021; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1022; GFX8-NEXT:    s_and_b32 s5, s3, s2
1023; GFX8-NEXT:    s_and_b32 s2, s4, s2
1024; GFX8-NEXT:    s_bfe_u32 s7, s4, 0x80008
1025; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1026; GFX8-NEXT:    v_mov_b32_e32 v1, s10
1027; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x80008
1028; GFX8-NEXT:    v_mad_u32_u24 v1, s5, v0, v1
1029; GFX8-NEXT:    v_mov_b32_e32 v2, s7
1030; GFX8-NEXT:    s_bfe_u32 s9, s4, 0x80010
1031; GFX8-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
1032; GFX8-NEXT:    s_bfe_u32 s8, s3, 0x80010
1033; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
1034; GFX8-NEXT:    v_mov_b32_e32 v1, s9
1035; GFX8-NEXT:    s_lshr_b32 s4, s4, 24
1036; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
1037; GFX8-NEXT:    s_lshr_b32 s3, s3, 24
1038; GFX8-NEXT:    v_mov_b32_e32 v1, s4
1039; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
1040; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1041; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1042; GFX8-NEXT:    flat_store_dword v[0:1], v2
1043; GFX8-NEXT:    s_endpgm
1044;
1045; GFX9-NODL-LABEL: udot4_multiuse_mul1:
1046; GFX9-NODL:       ; %bb.0: ; %entry
1047; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1048; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1049; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
1050; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1051; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1052; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
1053; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
1054; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
1055; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1056; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
1057; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
1058; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
1059; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
1060; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
1061; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
1062; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v1, v2
1063; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s7
1064; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
1065; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
1066; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
1067; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
1068; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
1069; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
1070; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
1071; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
1072; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
1073; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
1074; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
1075; GFX9-NODL-NEXT:    s_endpgm
1076;
1077; GFX9-DL-LABEL: udot4_multiuse_mul1:
1078; GFX9-DL:       ; %bb.0: ; %entry
1079; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1080; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1081; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
1082; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1083; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1084; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
1085; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
1086; GFX9-DL-NEXT:    s_load_dword s10, s[0:1], 0x0
1087; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1088; GFX9-DL-NEXT:    s_and_b32 s5, s3, s2
1089; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
1090; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x80008
1091; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
1092; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s10
1093; GFX9-DL-NEXT:    s_bfe_u32 s6, s3, 0x80008
1094; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v1, v2
1095; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s7
1096; GFX9-DL-NEXT:    s_bfe_u32 s9, s4, 0x80010
1097; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
1098; GFX9-DL-NEXT:    s_bfe_u32 s8, s3, 0x80010
1099; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
1100; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s9
1101; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 24
1102; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
1103; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 24
1104; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
1105; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
1106; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1107; GFX9-DL-NEXT:    s_endpgm
1108;
1109; GFX10-DL-LABEL: udot4_multiuse_mul1:
1110; GFX10-DL:       ; %bb.0: ; %entry
1111; GFX10-DL-NEXT:    s_clause 0x1
1112; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1113; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1114; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1115; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1116; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
1117; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1118; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1119; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
1120; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1121; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
1122; GFX10-DL-NEXT:    s_and_b32 s3, s0, s2
1123; GFX10-DL-NEXT:    s_and_b32 s2, s1, s2
1124; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x80008
1125; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x80008
1126; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
1127; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s7, v0
1128; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
1129; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80010
1130; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80010
1131; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
1132; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
1133; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
1134; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s0, s1, v0
1135; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
1136; GFX10-DL-NEXT:    s_endpgm
1137                                               <4 x i8> addrspace(1)* %src2,
1138                                               i32 addrspace(1)* nocapture %dst) {
1139entry:
  ; Load both packed 4 x i8 operands.
1140  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
1141  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
1142
  ; Per-lane products: each byte is zero-extended to i32 before the multiply.
1143  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1144  %cv1e0 = zext i8 %v1e0 to i32
1145  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1146  %cv2e0 = zext i8 %v2e0 to i32
1147  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1148
1149  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1150  %cv1e1 = zext i8 %v1e1 to i32
1151  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1152  %cv2e1 = zext i8 %v2e1 to i32
1153  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1154
1155  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1156  %cv1e2 = zext i8 %v1e2 to i32
1157  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1158  %cv2e2 = zext i8 %v2e2 to i32
1159  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1160
1161  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1162  %cv1e3 = zext i8 %v1e3 to i32
1163  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1164  %cv2e3 = zext i8 %v2e3 to i32
1165  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
1166
  ; Accumulate onto the i32 read from %dst. %mul1 feeds both %add and %add2,
  ; so the lane-0 product contributes twice to the stored result.
1167  %acc = load i32, i32 addrspace(1)* %dst, align 4
1168  %add = add i32 %mul1, %acc
1169  %add1 = add i32 %mul2, %add
1170  %add2 = add i32 %add1, %mul1
1171  %add3 = add i32 %add2, %mul3
1172  %add4 = add i32 %add3, %mul4
1173
  ; Write the final sum back to %dst.
1174  store i32 %add4, i32 addrspace(1)* %dst, align 4
1175  ret void
1176}
1177
; Zero-extended 4 x i8 dot product accumulated into an i32 loaded from %dst,
; where the first partial sum (%add1) has a second user outside the main add
; chain. Per the assertions below, all targets emit four v_mad_u32_u24 plus
; two vector adds, and none folds the chain into v_dot4_u32_u8.
1178define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
1179; GFX7-LABEL: udot4_multiuse_add1:
1180; GFX7:       ; %bb.0: ; %entry
1181; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1182; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1183; GFX7-NEXT:    s_movk_i32 s8, 0xff
1184; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1185; GFX7-NEXT:    s_mov_b32 s2, -1
1186; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1188; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1189; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
1190; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1191; GFX7-NEXT:    s_and_b32 s6, s4, s8
1192; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
1193; GFX7-NEXT:    s_and_b32 s7, s5, s8
1194; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
1195; GFX7-NEXT:    v_mov_b32_e32 v0, s9
1196; GFX7-NEXT:    v_mov_b32_e32 v1, s12
1197; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v0, v1
1198; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
1199; GFX7-NEXT:    v_mov_b32_e32 v2, s7
1200; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
1201; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s12, v0
1202; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v2, v0
1203; GFX7-NEXT:    v_mov_b32_e32 v2, s11
1204; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
1205; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v2, v0
1206; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
1207; GFX7-NEXT:    v_mov_b32_e32 v2, s5
1208; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v2, v0
1209; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1210; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1211; GFX7-NEXT:    s_endpgm
1212;
1213; GFX8-LABEL: udot4_multiuse_add1:
1214; GFX8:       ; %bb.0: ; %entry
1215; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1216; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1217; GFX8-NEXT:    s_movk_i32 s2, 0xff
1218; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1219; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
1220; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
1221; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
1222; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1223; GFX8-NEXT:    s_and_b32 s5, s3, s2
1224; GFX8-NEXT:    s_bfe_u32 s7, s4, 0x80008
1225; GFX8-NEXT:    s_and_b32 s2, s4, s2
1226; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x80008
1227; GFX8-NEXT:    v_mov_b32_e32 v0, s7
1228; GFX8-NEXT:    v_mov_b32_e32 v1, s10
1229; GFX8-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
1230; GFX8-NEXT:    s_bfe_u32 s9, s4, 0x80010
1231; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1232; GFX8-NEXT:    s_bfe_u32 s8, s3, 0x80010
1233; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s10, v0
1234; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v2, v0
1235; GFX8-NEXT:    v_mov_b32_e32 v2, s9
1236; GFX8-NEXT:    s_lshr_b32 s4, s4, 24
1237; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v2, v0
1238; GFX8-NEXT:    s_lshr_b32 s3, s3, 24
1239; GFX8-NEXT:    v_mov_b32_e32 v2, s4
1240; GFX8-NEXT:    v_mad_u32_u24 v0, s3, v2, v0
1241; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
1242; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1243; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1244; GFX8-NEXT:    flat_store_dword v[0:1], v2
1245; GFX8-NEXT:    s_endpgm
1246;
1247; GFX9-NODL-LABEL: udot4_multiuse_add1:
1248; GFX9-NODL:       ; %bb.0: ; %entry
1249; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1250; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1251; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
1252; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1253; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1254; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
1255; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
1256; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
1257; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1258; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
1259; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
1260; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
1261; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
1262; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
1263; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
1264; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v1, v2
1265; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
1266; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
1267; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
1268; GFX9-NODL-NEXT:    v_add_u32_e32 v2, s10, v1
1269; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v3, v1
1270; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s9
1271; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
1272; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s8, v3, v1
1273; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
1274; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s4
1275; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v3, v1
1276; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v2
1277; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
1278; GFX9-NODL-NEXT:    s_endpgm
1279;
1280; GFX9-DL-LABEL: udot4_multiuse_add1:
1281; GFX9-DL:       ; %bb.0: ; %entry
1282; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1283; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1284; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
1285; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1286; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1287; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
1288; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
1289; GFX9-DL-NEXT:    s_load_dword s10, s[0:1], 0x0
1290; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1291; GFX9-DL-NEXT:    s_and_b32 s5, s3, s2
1292; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x80008
1293; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
1294; GFX9-DL-NEXT:    s_bfe_u32 s6, s3, 0x80008
1295; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s7
1296; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s10
1297; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v1, v2
1298; GFX9-DL-NEXT:    s_bfe_u32 s9, s4, 0x80010
1299; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
1300; GFX9-DL-NEXT:    s_bfe_u32 s8, s3, 0x80010
1301; GFX9-DL-NEXT:    v_add_u32_e32 v2, s10, v1
1302; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v3, v1
1303; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
1304; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 24
1305; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v3, v1
1306; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 24
1307; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
1308; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s3, v3, v1
1309; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
1310; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1311; GFX9-DL-NEXT:    s_endpgm
1312;
1313; GFX10-DL-LABEL: udot4_multiuse_add1:
1314; GFX10-DL:       ; %bb.0: ; %entry
1315; GFX10-DL-NEXT:    s_clause 0x1
1316; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1317; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1318; GFX10-DL-NEXT:    s_movk_i32 s7, 0xff
1319; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1320; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1321; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
1322; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1323; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1324; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1325; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
1326; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80008
1327; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80008
1328; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
1329; GFX10-DL-NEXT:    s_and_b32 s2, s0, s7
1330; GFX10-DL-NEXT:    s_and_b32 s3, s1, s7
1331; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v0
1332; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80010
1333; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80010
1334; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
1335; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
1336; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1337; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, s6, v0
1338; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
1339; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
1340; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
1341; GFX10-DL-NEXT:    s_endpgm
1342                                               <4 x i8> addrspace(1)* %src2,
1343                                               i32 addrspace(1)* nocapture %dst) {
1344entry:
  ; Load both packed 4 x i8 operands.
1345  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
1346  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
1347
  ; Per-lane products: each byte is zero-extended to i32 before the multiply.
1348  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1349  %cv1e0 = zext i8 %v1e0 to i32
1350  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1351  %cv2e0 = zext i8 %v2e0 to i32
1352  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1353
1354  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1355  %cv1e1 = zext i8 %v1e1 to i32
1356  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1357  %cv2e1 = zext i8 %v2e1 to i32
1358  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1359
1360  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1361  %cv1e2 = zext i8 %v1e2 to i32
1362  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1363  %cv2e2 = zext i8 %v2e2 to i32
1364  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1365
1366  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1367  %cv1e3 = zext i8 %v1e3 to i32
1368  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1369  %cv2e3 = zext i8 %v2e3 to i32
1370  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
1371
  ; The chain starts with the lane-1 product. %add1 has two users: the side
  ; value %add (which adds %acc a second time) and the main chain via %add2.
1372  %acc = load i32, i32 addrspace(1)* %dst, align 4
1373  %add1 = add i32 %mul2, %acc
1374  %add = add i32 %add1, %acc
1375  %add2 = add i32 %add1, %mul1
1376  %add3 = add i32 %add2, %mul3
1377  %add4 = add i32 %add3, %mul4
  ; Combine the main chain with the side value and write the result to %dst.
1378  %res = add i32 %add4, %add
1379  store i32 %res, i32 addrspace(1)* %dst, align 4
1380  ret void
1381}
1382
1383define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
1384; GFX7-LABEL: notdot4_mixedtypes:
1385; GFX7:       ; %bb.0: ; %entry
1386; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1387; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1388; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1389; GFX7-NEXT:    s_mov_b32 s2, -1
1390; GFX7-NEXT:    s_mov_b32 s8, 0xffff
1391; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1392; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
1393; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1394; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1395; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1396; GFX7-NEXT:    s_sext_i32_i8 s6, s4
1397; GFX7-NEXT:    s_sext_i32_i8 s7, s5
1398; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
1399; GFX7-NEXT:    s_and_b32 s7, s7, s8
1400; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80008
1401; GFX7-NEXT:    v_mov_b32_e32 v1, s9
1402; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
1403; GFX7-NEXT:    s_and_b32 s6, s6, s8
1404; GFX7-NEXT:    v_mov_b32_e32 v3, s7
1405; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x80010
1406; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
1407; GFX7-NEXT:    v_mov_b32_e32 v2, s11
1408; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
1409; GFX7-NEXT:    s_waitcnt vmcnt(0)
1410; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
1411; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v3, v0
1412; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
1413; GFX7-NEXT:    v_mov_b32_e32 v1, s5
1414; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
1415; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
1416; GFX7-NEXT:    s_endpgm
1417;
1418; GFX8-LABEL: notdot4_mixedtypes:
1419; GFX8:       ; %bb.0: ; %entry
1420; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1421; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1422; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1423; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1424; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1425; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
1426; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1427; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
1428; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1429; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x80008
1430; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80008
1431; GFX8-NEXT:    s_sext_i32_i8 s3, s1
1432; GFX8-NEXT:    v_mov_b32_e32 v3, s5
1433; GFX8-NEXT:    s_bfe_u32 s7, s1, 0x80010
1434; GFX8-NEXT:    s_sext_i32_i8 s2, s0
1435; GFX8-NEXT:    v_mov_b32_e32 v4, s3
1436; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x80010
1437; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
1438; GFX8-NEXT:    v_mov_b32_e32 v5, s7
1439; GFX8-NEXT:    s_lshr_b32 s0, s0, 24
1440; GFX8-NEXT:    s_waitcnt vmcnt(0)
1441; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
1442; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v4, v2
1443; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
1444; GFX8-NEXT:    v_mov_b32_e32 v3, s1
1445; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
1446; GFX8-NEXT:    flat_store_short v[0:1], v2
1447; GFX8-NEXT:    s_endpgm
1448;
1449; GFX9-NODL-LABEL: notdot4_mixedtypes:
1450; GFX9-NODL:       ; %bb.0: ; %entry
1451; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1452; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1453; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1454; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1455; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[0:1]
1456; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
1457; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
1458; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1459; GFX9-NODL-NEXT:    s_bfe_u32 s6, s2, 0x80008
1460; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80008
1461; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
1462; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
1463; GFX9-NODL-NEXT:    s_bfe_u32 s9, s3, 0x80010
1464; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
1465; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s5
1466; GFX9-NODL-NEXT:    s_bfe_u32 s8, s2, 0x80010
1467; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
1468; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s9
1469; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
1470; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1471; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
1472; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v3, v1
1473; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s8, v4, v1
1474; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
1475; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
1476; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
1477; GFX9-NODL-NEXT:    s_endpgm
1478;
1479; GFX9-DL-LABEL: notdot4_mixedtypes:
1480; GFX9-DL:       ; %bb.0: ; %entry
1481; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1482; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1483; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1484; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1485; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
1486; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
1487; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
1488; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1489; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x80008
1490; GFX9-DL-NEXT:    s_bfe_u32 s7, s3, 0x80008
1491; GFX9-DL-NEXT:    s_sext_i32_i8 s5, s3
1492; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
1493; GFX9-DL-NEXT:    s_bfe_u32 s9, s3, 0x80010
1494; GFX9-DL-NEXT:    s_sext_i32_i8 s4, s2
1495; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
1496; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x80010
1497; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 24
1498; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s9
1499; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 24
1500; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1501; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
1502; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v3, v1
1503; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v4, v1
1504; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
1505; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
1506; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
1507; GFX9-DL-NEXT:    s_endpgm
1508;
1509; GFX10-DL-LABEL: notdot4_mixedtypes:
1510; GFX10-DL:       ; %bb.0: ; %entry
1511; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1512; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1513; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1514; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1515; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
1516; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1517; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1518; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1519; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80008
1520; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80008
1521; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1522; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1523; GFX10-DL-NEXT:    s_sext_i32_i8 s2, s0
1524; GFX10-DL-NEXT:    s_sext_i32_i8 s3, s1
1525; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1526; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80010
1527; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80010
1528; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
1529; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
1530; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
1531; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
1532; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
1533; GFX10-DL-NEXT:    s_endpgm
1534                                              <4 x i8> addrspace(1)* %src2,
1535                                              i16 addrspace(1)* nocapture %dst) {
1536entry:
1537  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
1538  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
1539
1540  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1541  %cv1e0 = sext i8 %v1e0 to i16
1542  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1543  %cv2e0 = sext i8 %v2e0 to i16
1544  %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
1545
1546  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1547  %cv1e1 = zext i8 %v1e1 to i16
1548  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1549  %cv2e1 = zext i8 %v2e1 to i16
1550  %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
1551
1552  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1553  %cv1e2 = zext i8 %v1e2 to i16
1554  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1555  %cv2e2 = zext i8 %v2e2 to i16
1556  %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
1557
1558  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1559  %cv1e3 = zext i8 %v1e3 to i16
1560  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1561  %cv2e3 = zext i8 %v2e3 to i16
1562  %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
1563
1564  %acc = load i16, i16 addrspace(1)* %dst, align 2
1565  %add1 = add i16 %mul2, %acc
1566  %add2 = add i16 %add1, %mul1
1567  %add3 = add i16 %add2, %mul3
1568  %add4 = add i16 %add3, %mul4
1569
1570  store i16 %add4, i16 addrspace(1)* %dst, align 2
1571  ret void
1572}
1573
1574; TODO: clean up s_lshr_b32 and support this pattern.
; Test input: loads two <4 x i8> vectors, zero-extends both to <4 x i32>,
; multiplies them as whole vectors, then adds the four product lanes one at a
; time onto the i32 value loaded from %dst and stores the sum back to %dst.
; The autogenerated expectations below contain a chain of four v_mad_u32_u24
; for every checked target.
1575define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
1576; GFX7-LABEL: udot4_acc32_vecMul:
1577; GFX7:       ; %bb.0: ; %entry
1578; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1579; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1580; GFX7-NEXT:    s_movk_i32 s11, 0xff
1581; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1582; GFX7-NEXT:    s_mov_b32 s2, -1
1583; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1584; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1585; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1586; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1587; GFX7-NEXT:    s_lshr_b32 s6, s4, 24
1588; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x80008
1589; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
1590; GFX7-NEXT:    s_lshr_b32 s8, s5, 24
1591; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
1592; GFX7-NEXT:    s_bfe_u32 s12, s5, 0x80010
1593; GFX7-NEXT:    s_and_b32 s5, s5, s11
1594; GFX7-NEXT:    s_and_b32 s4, s4, s11
1595; GFX7-NEXT:    s_load_dword s11, s[0:1], 0x0
1596; GFX7-NEXT:    v_mov_b32_e32 v0, s5
1597; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1598; GFX7-NEXT:    v_mov_b32_e32 v1, s11
1599; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v0, v1
1600; GFX7-NEXT:    v_mov_b32_e32 v1, s9
1601; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
1602; GFX7-NEXT:    v_mov_b32_e32 v1, s12
1603; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
1604; GFX7-NEXT:    v_mov_b32_e32 v1, s8
1605; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
1606; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1607; GFX7-NEXT:    s_endpgm
1608;
1609; GFX8-LABEL: udot4_acc32_vecMul:
1610; GFX8:       ; %bb.0: ; %entry
1611; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1612; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1613; GFX8-NEXT:    s_movk_i32 s2, 0xff
1614; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1615; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
1616; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
1617; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1618; GFX8-NEXT:    s_lshr_b32 s5, s3, 24
1619; GFX8-NEXT:    s_lshr_b32 s6, s4, 24
1620; GFX8-NEXT:    s_bfe_u32 s7, s3, 0x80010
1621; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 8, s3
1622; GFX8-NEXT:    s_and_b32 s3, s3, s2
1623; GFX8-NEXT:    s_and_b32 s2, s4, s2
1624; GFX8-NEXT:    s_bfe_u32 s8, s4, 0x80010
1625; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 8, s4
1626; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
1627; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1628; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1629; GFX8-NEXT:    v_mov_b32_e32 v3, s4
1630; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v2, v3
1631; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
1632; GFX8-NEXT:    v_mov_b32_e32 v1, s8
1633; GFX8-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
1634; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1635; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v1, v0
1636; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1637; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1638; GFX8-NEXT:    flat_store_dword v[0:1], v2
1639; GFX8-NEXT:    s_endpgm
1640;
1641; GFX9-NODL-LABEL: udot4_acc32_vecMul:
1642; GFX9-NODL:       ; %bb.0: ; %entry
1643; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1644; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1645; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
1646; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1647; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1648; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
1649; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
1650; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1651; GFX9-NODL-NEXT:    s_lshr_b32 s5, s3, 24
1652; GFX9-NODL-NEXT:    s_lshr_b32 s6, s4, 24
1653; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80010
1654; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
1655; GFX9-NODL-NEXT:    s_and_b32 s3, s3, s2
1656; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
1657; GFX9-NODL-NEXT:    s_bfe_u32 s8, s4, 0x80010
1658; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v2, 8, s4
1659; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
1660; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
1661; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1662; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
1663; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, s3, v3, v4
1664; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v1, v2, v3
1665; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s8
1666; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
1667; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
1668; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
1669; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
1670; GFX9-NODL-NEXT:    s_endpgm
1671;
1672; GFX9-DL-LABEL: udot4_acc32_vecMul:
1673; GFX9-DL:       ; %bb.0: ; %entry
1674; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1675; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1676; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
1677; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1678; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1679; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
1680; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
1681; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1682; GFX9-DL-NEXT:    s_lshr_b32 s5, s3, 24
1683; GFX9-DL-NEXT:    s_lshr_b32 s6, s4, 24
1684; GFX9-DL-NEXT:    s_bfe_u32 s7, s3, 0x80010
1685; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
1686; GFX9-DL-NEXT:    s_and_b32 s3, s3, s2
1687; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
1688; GFX9-DL-NEXT:    s_bfe_u32 s8, s4, 0x80010
1689; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s4
1690; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
1691; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
1692; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1693; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
1694; GFX9-DL-NEXT:    v_mad_u32_u24 v3, s3, v3, v4
1695; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v1, v2, v3
1696; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s8
1697; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
1698; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
1699; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
1700; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1701; GFX9-DL-NEXT:    s_endpgm
1702;
1703; GFX10-DL-LABEL: udot4_acc32_vecMul:
1704; GFX10-DL:       ; %bb.0: ; %entry
1705; GFX10-DL-NEXT:    s_clause 0x1
1706; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1707; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1708; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1709; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
1710; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
1711; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
1712; GFX10-DL-NEXT:    s_movk_i32 s6, 0xff
1713; GFX10-DL-NEXT:    s_mov_b32 s5, 0xffff
1714; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1715; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
1716; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
1717; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
1718; GFX10-DL-NEXT:    s_and_b32 s4, s2, s6
1719; GFX10-DL-NEXT:    s_and_b32 s6, s3, s6
1720; GFX10-DL-NEXT:    v_and_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
1721; GFX10-DL-NEXT:    v_and_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
1722; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s6, v2
1723; GFX10-DL-NEXT:    s_bfe_u32 s4, s2, 0x80010
1724; GFX10-DL-NEXT:    s_bfe_u32 s5, s3, 0x80010
1725; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 24
1726; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 24
1727; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
1728; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1729; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s4, s5, v0
1730; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
1731; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
1732; GFX10-DL-NEXT:    s_endpgm
1733                                              <4 x i8> addrspace(1)* %src2,
1734                                              i32 addrspace(1)* nocapture %dst) {
1735entry:
  ; Fetch both packed byte vectors.
1736  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
1737  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
1738
  ; Widen every lane to i32 with zext, i.e. the bytes are treated as unsigned.
1739  %cvec1 = zext <4 x i8> %vec1 to <4 x i32>
1740  %cvec2 = zext <4 x i8> %vec2 to <4 x i32>
1741
  ; A single vector multiply, then split the result into its four scalar lanes.
1742  %mul = mul <4 x i32> %cvec1, %cvec2
1743  %mul0 = extractelement <4 x i32> %mul, i64 0
1744  %mul1 = extractelement <4 x i32> %mul, i64 1
1745  %mul2 = extractelement <4 x i32> %mul, i64 2
1746  %mul3 = extractelement <4 x i32> %mul, i64 3
1747
  ; Accumulate acc + mul0 + mul1 + mul2 + mul3, in exactly this order.
1748  %acc = load i32, i32 addrspace(1)* %dst, align 4
1749  %add1 = add i32 %mul0, %acc
1750  %add2 = add i32 %add1, %mul1
1751  %add3 = add i32 %add2, %mul2
1752  %add4 = add i32 %add3, %mul3
1753
  ; Overwrite the accumulator in %dst with the final sum.
1754  store i32 %add4, i32 addrspace(1)* %dst, align 4
1755  ret void
1756}
1757
1758; TODO: This pattern should be recognized.
; Test input: same shape as a 4-element unsigned byte dot product, but with a
; 16-bit accumulator. Both <4 x i8> inputs are zero-extended to <4 x i16>,
; multiplied as whole vectors, and the four product lanes are added one at a
; time onto the i16 value loaded from %dst, which is then stored back.
; In the autogenerated expectations below, gfx700 and gfx803 use a chain of
; four v_mad_u32_u24, while the gfx9 and gfx10 targets use two v_pk_mul_lo_u16
; followed by four add instructions.
1759define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
1760; GFX7-LABEL: udot4_acc16_vecMul:
1761; GFX7:       ; %bb.0: ; %entry
1762; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1763; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1764; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1765; GFX7-NEXT:    s_mov_b32 s2, -1
1766; GFX7-NEXT:    s_movk_i32 s8, 0xff
1767; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1768; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
1769; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1770; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1771; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1772; GFX7-NEXT:    s_lshr_b32 s6, s4, 24
1773; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80008
1774; GFX7-NEXT:    s_bfe_u32 s12, s5, 0x80010
1775; GFX7-NEXT:    s_lshr_b32 s9, s5, 24
1776; GFX7-NEXT:    s_and_b32 s5, s5, s8
1777; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x80008
1778; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
1779; GFX7-NEXT:    s_and_b32 s4, s4, s8
1780; GFX7-NEXT:    v_mov_b32_e32 v1, s5
1781; GFX7-NEXT:    v_mov_b32_e32 v2, s10
1782; GFX7-NEXT:    v_mov_b32_e32 v3, s12
1783; GFX7-NEXT:    s_waitcnt vmcnt(0)
1784; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
1785; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v2, v0
1786; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
1787; GFX7-NEXT:    v_mov_b32_e32 v1, s9
1788; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
1789; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
1790; GFX7-NEXT:    s_endpgm
1791;
1792; GFX8-LABEL: udot4_acc16_vecMul:
1793; GFX8:       ; %bb.0: ; %entry
1794; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1795; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1796; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1797; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1798; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1799; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
1800; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
1801; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
1802; GFX8-NEXT:    s_movk_i32 s0, 0xff
1803; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1804; GFX8-NEXT:    s_and_b32 s6, s1, s0
1805; GFX8-NEXT:    s_and_b32 s0, s2, s0
1806; GFX8-NEXT:    v_mov_b32_e32 v5, s0
1807; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
1808; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 8, s2
1809; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 8, s1
1810; GFX8-NEXT:    s_lshr_b32 s4, s2, 24
1811; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80010
1812; GFX8-NEXT:    v_mov_b32_e32 v6, s7
1813; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
1814; GFX8-NEXT:    s_waitcnt vmcnt(0)
1815; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
1816; GFX8-NEXT:    v_mad_u32_u24 v2, v4, v3, v2
1817; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v6, v2
1818; GFX8-NEXT:    v_mov_b32_e32 v3, s4
1819; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
1820; GFX8-NEXT:    flat_store_short v[0:1], v2
1821; GFX8-NEXT:    s_endpgm
1822;
1823; GFX9-NODL-LABEL: udot4_acc16_vecMul:
1824; GFX9-NODL:       ; %bb.0: ; %entry
1825; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1826; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1827; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, 0xffff
1828; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1829; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1830; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
1831; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
1832; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1833; GFX9-NODL-NEXT:    s_lshr_b32 s5, s2, 16
1834; GFX9-NODL-NEXT:    s_lshr_b32 s7, s3, 16
1835; GFX9-NODL-NEXT:    s_lshr_b32 s4, s2, 24
1836; GFX9-NODL-NEXT:    v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1837; GFX9-NODL-NEXT:    s_lshr_b32 s6, s3, 24
1838; GFX9-NODL-NEXT:    v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1839; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, s6, 16, v4
1840; GFX9-NODL-NEXT:    v_lshl_or_b32 v5, s4, 16, v5
1841; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
1842; GFX9-NODL-NEXT:    v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1843; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
1844; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
1845; GFX9-NODL-NEXT:    v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1846; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
1847; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
1848; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1849; GFX9-NODL-NEXT:    global_load_ushort v2, v0, s[0:1]
1850; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1851; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v1, v2
1852; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1853; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v4
1854; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1855; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
1856; GFX9-NODL-NEXT:    s_endpgm
1857;
1858; GFX9-DL-LABEL: udot4_acc16_vecMul:
1859; GFX9-DL:       ; %bb.0: ; %entry
1860; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1861; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1862; GFX9-DL-NEXT:    v_mov_b32_e32 v3, 0xffff
1863; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1864; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1865; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
1866; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
1867; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1868; GFX9-DL-NEXT:    s_lshr_b32 s5, s2, 16
1869; GFX9-DL-NEXT:    s_lshr_b32 s7, s3, 16
1870; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 24
1871; GFX9-DL-NEXT:    v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1872; GFX9-DL-NEXT:    s_lshr_b32 s6, s3, 24
1873; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1874; GFX9-DL-NEXT:    v_lshl_or_b32 v4, s6, 16, v4
1875; GFX9-DL-NEXT:    v_lshl_or_b32 v5, s4, 16, v5
1876; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
1877; GFX9-DL-NEXT:    v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1878; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
1879; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
1880; GFX9-DL-NEXT:    v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1881; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
1882; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
1883; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1884; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[0:1]
1885; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1886; GFX9-DL-NEXT:    v_add_u32_e32 v2, v1, v2
1887; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1888; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v4
1889; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1890; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
1891; GFX9-DL-NEXT:    s_endpgm
1892;
1893; GFX10-DL-LABEL: udot4_acc16_vecMul:
1894; GFX10-DL:       ; %bb.0: ; %entry
1895; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1896; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1897; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1898; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
1899; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1900; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
1901; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1902; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1903; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1904; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s0
1905; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1906; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s1
1907; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1908; GFX10-DL-NEXT:    s_lshr_b32 s2, s1, 16
1909; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 16
1910; GFX10-DL-NEXT:    v_lshl_or_b32 v3, v3, 16, v6
1911; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
1912; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
1913; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1914; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1915; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
1916; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
1917; GFX10-DL-NEXT:    v_lshl_or_b32 v4, s1, 16, v5
1918; GFX10-DL-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
1919; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
1920; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1921; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v3, v1
1922; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1923; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1924; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1925; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
1926; GFX10-DL-NEXT:    s_endpgm
1927                                              <4 x i8> addrspace(1)* %src2,
1928                                              i16 addrspace(1)* nocapture %dst) {
1929entry:
  ; Fetch both packed byte vectors.
1930  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
1931  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
1932
  ; Widen every lane to i16 with zext, i.e. the bytes are treated as unsigned.
1933  %cvec1 = zext <4 x i8> %vec1 to <4 x i16>
1934  %cvec2 = zext <4 x i8> %vec2 to <4 x i16>
1935
  ; A single vector multiply, then split the result into its four scalar lanes.
1936  %mul = mul <4 x i16> %cvec1, %cvec2
1937  %mul0 = extractelement <4 x i16> %mul, i64 0
1938  %mul1 = extractelement <4 x i16> %mul, i64 1
1939  %mul2 = extractelement <4 x i16> %mul, i64 2
1940  %mul3 = extractelement <4 x i16> %mul, i64 3
1941
  ; Accumulate acc + mul0 + mul1 + mul2 + mul3, in exactly this order.
  ; NOTE(review): align 4 is stricter than an i16 access needs - confirm it is
  ; intentional before changing it, since the expectations above depend on it.
1942  %acc = load i16, i16 addrspace(1)* %dst, align 4
1943  %add1 = add i16 %mul0, %acc
1944  %add2 = add i16 %add1, %mul1
1945  %add3 = add i16 %add2, %mul2
1946  %add4 = add i16 %add3, %mul3
1947
  ; Overwrite the accumulator in %dst with the final 16-bit sum.
1948  store i16 %add4, i16 addrspace(1)* %dst, align 4
1949  ret void
1950}
1951
1952; TODO: Support this pattern.
; Test input: 8-bit variant with no widening at all. The two <4 x i8> inputs
; are multiplied directly as <4 x i8>, and the four product lanes are added
; one at a time onto the i8 value loaded from %dst, which is then stored back.
; In the autogenerated expectations below, every target computes the four
; products with separate multiply instructions and finishes with four add
; instructions before the byte store.
1953define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
1954; GFX7-LABEL: udot4_acc8_vecMul:
1955; GFX7:       ; %bb.0: ; %entry
1956; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1957; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1958; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1959; GFX7-NEXT:    s_mov_b32 s2, -1
1960; GFX7-NEXT:    s_movk_i32 s8, 0xff
1961; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1962; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
1963; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1964; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1965; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1966; GFX7-NEXT:    s_bfe_u32 s6, s4, 0x80008
1967; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80008
1968; GFX7-NEXT:    s_lshr_b32 s11, s5, 16
1969; GFX7-NEXT:    s_lshr_b32 s12, s5, 24
1970; GFX7-NEXT:    v_mov_b32_e32 v3, s10
1971; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
1972; GFX7-NEXT:    v_mov_b32_e32 v2, s11
1973; GFX7-NEXT:    s_lshr_b32 s9, s4, 24
1974; GFX7-NEXT:    v_mov_b32_e32 v1, s12
1975; GFX7-NEXT:    s_mul_i32 s4, s4, s5
1976; GFX7-NEXT:    v_mul_u32_u24_e32 v1, s9, v1
1977; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s7, v2
1978; GFX7-NEXT:    v_mul_u32_u24_e32 v3, s6, v3
1979; GFX7-NEXT:    s_and_b32 s5, s4, s8
1980; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1981; GFX7-NEXT:    v_and_b32_e32 v2, s8, v2
1982; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
1983; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
1984; GFX7-NEXT:    v_or_b32_e32 v2, s5, v3
1985; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1986; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1987; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
1988; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
1989; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
1990; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1991; GFX7-NEXT:    s_waitcnt vmcnt(0)
1992; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
1993; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1994; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
1995; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1996; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1997; GFX7-NEXT:    s_endpgm
1998;
1999; GFX8-LABEL: udot4_acc8_vecMul:
2000; GFX8:       ; %bb.0: ; %entry
2001; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2002; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2003; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2004; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2005; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2006; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
2007; GFX8-NEXT:    s_movk_i32 s0, 0xff
2008; GFX8-NEXT:    v_mov_b32_e32 v3, s0
2009; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
2010; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
2011; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2012; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
2013; GFX8-NEXT:    s_lshr_b32 s4, s1, 24
2014; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
2015; GFX8-NEXT:    v_mov_b32_e32 v4, s0
2016; GFX8-NEXT:    v_mov_b32_e32 v5, s1
2017; GFX8-NEXT:    s_mul_i32 s0, s0, s1
2018; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
2019; GFX8-NEXT:    v_mul_u32_u24_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2020; GFX8-NEXT:    v_mov_b32_e32 v5, s5
2021; GFX8-NEXT:    v_and_b32_e32 v3, s0, v3
2022; GFX8-NEXT:    v_mov_b32_e32 v6, s4
2023; GFX8-NEXT:    v_mov_b32_e32 v7, s2
2024; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
2025; GFX8-NEXT:    v_mul_u32_u24_e32 v5, s3, v5
2026; GFX8-NEXT:    v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2027; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2028; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2029; GFX8-NEXT:    v_or_b32_e32 v4, v3, v4
2030; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
2031; GFX8-NEXT:    s_waitcnt vmcnt(0)
2032; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
2033; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
2034; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2035; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2036; GFX8-NEXT:    flat_store_byte v[0:1], v2
2037; GFX8-NEXT:    s_endpgm
2038;
2039; GFX9-NODL-LABEL: udot4_acc8_vecMul:
2040; GFX9-NODL:       ; %bb.0: ; %entry
2041; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2042; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2043; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2044; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2045; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
2046; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
2047; GFX9-NODL-NEXT:    global_load_ubyte v4, v0, s[0:1]
2048; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2049; GFX9-NODL-NEXT:    s_lshr_b32 s4, s2, 16
2050; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
2051; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
2052; GFX9-NODL-NEXT:    s_lshr_b32 s6, s3, 16
2053; GFX9-NODL-NEXT:    s_lshr_b32 s7, s3, 24
2054; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v1, s2, v1
2055; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2056; GFX9-NODL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2057; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
2058; GFX9-NODL-NEXT:    s_lshr_b32 s5, s2, 24
2059; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
2060; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2061; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v3, s4, v3
2062; GFX9-NODL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2063; GFX9-NODL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2064; GFX9-NODL-NEXT:    v_or_b32_e32 v2, v1, v2
2065; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
2066; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2067; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v4
2068; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v3
2069; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2070; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2071; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
2072; GFX9-NODL-NEXT:    s_endpgm
2073;
2074; GFX9-DL-LABEL: udot4_acc8_vecMul:
2075; GFX9-DL:       ; %bb.0: ; %entry
2076; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2077; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2078; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2079; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2080; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
2081; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
2082; GFX9-DL-NEXT:    global_load_ubyte v4, v0, s[0:1]
2083; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2084; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 16
2085; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
2086; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
2087; GFX9-DL-NEXT:    s_lshr_b32 s6, s3, 16
2088; GFX9-DL-NEXT:    s_lshr_b32 s7, s3, 24
2089; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v1, s2, v1
2090; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2091; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2092; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s6
2093; GFX9-DL-NEXT:    s_lshr_b32 s5, s2, 24
2094; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
2095; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2096; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, s4, v3
2097; GFX9-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2098; GFX9-DL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2099; GFX9-DL-NEXT:    v_or_b32_e32 v2, v1, v2
2100; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
2101; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2102; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v4
2103; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v3
2104; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2105; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2106; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
2107; GFX9-DL-NEXT:    s_endpgm
2108;
2109; GFX10-DL-LABEL: udot4_acc8_vecMul:
2110; GFX10-DL:       ; %bb.0: ; %entry
2111; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2112; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2113; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2114; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2115; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
2116; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2117; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2118; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2119; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s0
2120; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s1
2121; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 24
2122; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 24
2123; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s2, s3
2124; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v2, v2, v3
2125; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, s0, s1
2126; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 16
2127; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 16
2128; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 8, v2
2129; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2130; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v4
2131; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s0, s1
2132; GFX10-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2133; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2134; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v3
2135; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
2136; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2137; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
2138; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v4
2139; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2140; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2141; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
2142; GFX10-DL-NEXT:    s_endpgm
2143                                             <4 x i8> addrspace(1)* %src2,
2144                                             i8 addrspace(1)* nocapture %dst) {
2145entry:
  ; Fetch both packed byte vectors.
2146  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
2147  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
2148
  ; Multiply directly at 8 bits (no zext/sext first), then split the result
  ; into its four scalar lanes.
2149  %mul = mul <4 x i8> %vec1, %vec2
2150  %mul0 = extractelement <4 x i8> %mul, i64 0
2151  %mul1 = extractelement <4 x i8> %mul, i64 1
2152  %mul2 = extractelement <4 x i8> %mul, i64 2
2153  %mul3 = extractelement <4 x i8> %mul, i64 3
2154
  ; Accumulate acc + mul0 + mul1 + mul2 + mul3, in exactly this order.
  ; NOTE(review): align 4 is stricter than an i8 access needs - confirm it is
  ; intentional before changing it, since the expectations above depend on it.
2155  %acc = load i8, i8 addrspace(1)* %dst, align 4
2156  %add1 = add i8 %mul0, %acc
2157  %add2 = add i8 %add1, %mul1
2158  %add3 = add i8 %add2, %mul2
2159  %add4 = add i8 %add3, %mul3
2160
  ; Overwrite the accumulator in %dst with the final 8-bit sum.
2161  store i8 %add4, i8 addrspace(1)* %dst, align 4
2162  ret void
2163}
2164