1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
9; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
10
; idot8_acc32: signed dot product of two <8 x i4> vectors with an i32 accumulator.
; The kernel loads one <8 x i4> from %src1 and one from %src2, sign-extends every
; lane to i32, multiplies matching lanes, adds the eight products to the i32
; currently stored at %dst and writes the sum back to %dst.
; Per the assertions below, the gfx906 and gfx10xx runs select a single
; v_dot8_i32_i4, while the gfx700/gfx803/gfx900 runs expand to
; s_bfe_i32/s_ashr_i32 lane extracts feeding a chain of v_mad_i32_i24.
11define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
12; GFX7-LABEL: idot8_acc32:
13; GFX7:       ; %bb.0: ; %entry
14; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
15; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
16; GFX7-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
17; GFX7-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
18; GFX7-NEXT:    s_mov_b32 s26, -1
19; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
20; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
21; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
22; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
23; GFX7-NEXT:    s_mov_b32 s27, 0xe8f000
24; GFX7-NEXT:    s_add_u32 s24, s24, s3
25; GFX7-NEXT:    s_addc_u32 s25, s25, 0
26; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX7-NEXT:    s_bfe_i32 s7, s5, 0x40000
28; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
29; GFX7-NEXT:    s_bfe_i32 s9, s5, 0x40004
30; GFX7-NEXT:    v_mov_b32_e32 v0, s7
31; GFX7-NEXT:    v_mov_b32_e32 v1, s20
32; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v0, v1
33; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x40004
34; GFX7-NEXT:    v_mov_b32_e32 v1, s9
35; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x40008
36; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
37; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x40008
38; GFX7-NEXT:    v_mov_b32_e32 v1, s11
39; GFX7-NEXT:    s_bfe_i32 s13, s5, 0x4000c
40; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
41; GFX7-NEXT:    s_bfe_i32 s12, s4, 0x4000c
42; GFX7-NEXT:    v_mov_b32_e32 v1, s13
43; GFX7-NEXT:    s_bfe_i32 s15, s5, 0x40010
44; GFX7-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
45; GFX7-NEXT:    s_bfe_i32 s14, s4, 0x40010
46; GFX7-NEXT:    v_mov_b32_e32 v1, s15
47; GFX7-NEXT:    s_bfe_i32 s17, s5, 0x40014
48; GFX7-NEXT:    s_bfe_i32 s19, s5, 0x40018
49; GFX7-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
50; GFX7-NEXT:    s_bfe_i32 s16, s4, 0x40014
51; GFX7-NEXT:    v_mov_b32_e32 v1, s17
52; GFX7-NEXT:    s_bfe_i32 s18, s4, 0x40018
53; GFX7-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
54; GFX7-NEXT:    v_mov_b32_e32 v1, s19
55; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
56; GFX7-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
57; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
58; GFX7-NEXT:    v_mov_b32_e32 v1, s5
59; GFX7-NEXT:    s_mov_b32 s3, 0xf000
60; GFX7-NEXT:    s_mov_b32 s2, -1
61; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v1, v0
62; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
63; GFX7-NEXT:    s_endpgm
64;
65; GFX8-LABEL: idot8_acc32:
66; GFX8:       ; %bb.0: ; %entry
67; GFX8-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
68; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
69; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
70; GFX8-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
71; GFX8-NEXT:    s_mov_b32 s22, -1
72; GFX8-NEXT:    s_mov_b32 s23, 0xe80000
73; GFX8-NEXT:    s_add_u32 s20, s20, s3
74; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
75; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
76; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
77; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
78; GFX8-NEXT:    s_addc_u32 s21, s21, 0
79; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
80; GFX8-NEXT:    s_bfe_i32 s4, s2, 0x40000
81; GFX8-NEXT:    s_bfe_i32 s5, s3, 0x40000
82; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x40004
83; GFX8-NEXT:    v_mov_b32_e32 v0, s5
84; GFX8-NEXT:    v_mov_b32_e32 v1, s18
85; GFX8-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
86; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x40004
87; GFX8-NEXT:    v_mov_b32_e32 v1, s7
88; GFX8-NEXT:    s_bfe_i32 s9, s3, 0x40008
89; GFX8-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
90; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x40008
91; GFX8-NEXT:    v_mov_b32_e32 v1, s9
92; GFX8-NEXT:    s_bfe_i32 s11, s3, 0x4000c
93; GFX8-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
94; GFX8-NEXT:    s_bfe_i32 s10, s2, 0x4000c
95; GFX8-NEXT:    v_mov_b32_e32 v1, s11
96; GFX8-NEXT:    s_bfe_i32 s13, s3, 0x40010
97; GFX8-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
98; GFX8-NEXT:    s_bfe_i32 s12, s2, 0x40010
99; GFX8-NEXT:    v_mov_b32_e32 v1, s13
100; GFX8-NEXT:    s_bfe_i32 s15, s3, 0x40014
101; GFX8-NEXT:    s_bfe_i32 s17, s3, 0x40018
102; GFX8-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
103; GFX8-NEXT:    s_bfe_i32 s14, s2, 0x40014
104; GFX8-NEXT:    v_mov_b32_e32 v1, s15
105; GFX8-NEXT:    s_bfe_i32 s16, s2, 0x40018
106; GFX8-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
107; GFX8-NEXT:    v_mov_b32_e32 v1, s17
108; GFX8-NEXT:    s_ashr_i32 s3, s3, 28
109; GFX8-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
110; GFX8-NEXT:    s_ashr_i32 s2, s2, 28
111; GFX8-NEXT:    v_mov_b32_e32 v1, s3
112; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
113; GFX8-NEXT:    v_mov_b32_e32 v0, s0
114; GFX8-NEXT:    v_mov_b32_e32 v1, s1
115; GFX8-NEXT:    flat_store_dword v[0:1], v2
116; GFX8-NEXT:    s_endpgm
117;
118; GFX9-LABEL: idot8_acc32:
119; GFX9:       ; %bb.0: ; %entry
120; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
121; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
122; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
123; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
124; GFX9-NEXT:    s_mov_b32 s22, -1
125; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
126; GFX9-NEXT:    s_add_u32 s20, s20, s3
127; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
129; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
130; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
131; GFX9-NEXT:    s_addc_u32 s21, s21, 0
132; GFX9-NEXT:    v_mov_b32_e32 v0, 0
133; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX9-NEXT:    s_bfe_i32 s4, s2, 0x40000
135; GFX9-NEXT:    s_bfe_i32 s5, s3, 0x40000
136; GFX9-NEXT:    s_bfe_i32 s7, s3, 0x40004
137; GFX9-NEXT:    v_mov_b32_e32 v1, s5
138; GFX9-NEXT:    v_mov_b32_e32 v2, s18
139; GFX9-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
140; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x40004
141; GFX9-NEXT:    v_mov_b32_e32 v2, s7
142; GFX9-NEXT:    s_bfe_i32 s9, s3, 0x40008
143; GFX9-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
144; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40008
145; GFX9-NEXT:    v_mov_b32_e32 v2, s9
146; GFX9-NEXT:    s_bfe_i32 s11, s3, 0x4000c
147; GFX9-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
148; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x4000c
149; GFX9-NEXT:    v_mov_b32_e32 v2, s11
150; GFX9-NEXT:    s_bfe_i32 s13, s3, 0x40010
151; GFX9-NEXT:    v_mad_i32_i24 v1, s10, v2, v1
152; GFX9-NEXT:    s_bfe_i32 s12, s2, 0x40010
153; GFX9-NEXT:    v_mov_b32_e32 v2, s13
154; GFX9-NEXT:    s_bfe_i32 s15, s3, 0x40014
155; GFX9-NEXT:    s_bfe_i32 s17, s3, 0x40018
156; GFX9-NEXT:    v_mad_i32_i24 v1, s12, v2, v1
157; GFX9-NEXT:    s_bfe_i32 s14, s2, 0x40014
158; GFX9-NEXT:    v_mov_b32_e32 v2, s15
159; GFX9-NEXT:    s_bfe_i32 s16, s2, 0x40018
160; GFX9-NEXT:    v_mad_i32_i24 v1, s14, v2, v1
161; GFX9-NEXT:    v_mov_b32_e32 v2, s17
162; GFX9-NEXT:    s_ashr_i32 s3, s3, 28
163; GFX9-NEXT:    v_mad_i32_i24 v1, s16, v2, v1
164; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
165; GFX9-NEXT:    v_mov_b32_e32 v2, s3
166; GFX9-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
167; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
168; GFX9-NEXT:    s_endpgm
169;
170; GFX9-DL-LABEL: idot8_acc32:
171; GFX9-DL:       ; %bb.0: ; %entry
172; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
173; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
174; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
175; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
176; GFX9-DL-NEXT:    s_mov_b32 s10, -1
177; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
178; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
179; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
180; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
181; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
182; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
183; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
184; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
185; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
186; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
187; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
188; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, s4, v1, v2
189; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
190; GFX9-DL-NEXT:    s_endpgm
191;
192; GFX10-DL-LABEL: idot8_acc32:
193; GFX10-DL:       ; %bb.0: ; %entry
194; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
195; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
196; GFX10-DL-NEXT:    s_mov_b32 s10, -1
197; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
198; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
199; GFX10-DL-NEXT:    s_clause 0x1
200; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
201; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
202; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
203; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
204; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
206; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
207; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
208; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
209; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
210; GFX10-DL-NEXT:    v_dot8_i32_i4 v0, s0, s1, v0
211; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
212; GFX10-DL-NEXT:    s_endpgm
213                                       <8 x i4> addrspace(1)* %src2,
214                                       i32 addrspace(1)* nocapture %dst) {
215entry:
; Load both packed operands; each is a vector of eight i4 lanes.
216  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
217  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
218
; Lanes 0..7 follow one template: extract the i4 lane from each vector,
; sign-extend both to i32 and multiply them.
; NOTE(review): the multiplies carry `nuw` even though the operands are
; sign-extended; a negative lane would make most products wrap in the unsigned
; sense - presumably inherited from the reduced reproducer, confirm it is intended.
219  %v1e0 = extractelement <8 x i4> %vec1, i64 0
220  %cv1e0 = sext i4 %v1e0 to i32
221  %v2e0 = extractelement <8 x i4> %vec2, i64 0
222  %cv2e0 = sext i4 %v2e0 to i32
223  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
224
225  %v1e1 = extractelement <8 x i4> %vec1, i64 1
226  %cv1e1 = sext i4 %v1e1 to i32
227  %v2e1 = extractelement <8 x i4> %vec2, i64 1
228  %cv2e1 = sext i4 %v2e1 to i32
229  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
230
231  %v1e2 = extractelement <8 x i4> %vec1, i64 2
232  %cv1e2 = sext i4 %v1e2 to i32
233  %v2e2 = extractelement <8 x i4> %vec2, i64 2
234  %cv2e2 = sext i4 %v2e2 to i32
235  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
236
237  %v1e3 = extractelement <8 x i4> %vec1, i64 3
238  %cv1e3 = sext i4 %v1e3 to i32
239  %v2e3 = extractelement <8 x i4> %vec2, i64 3
240  %cv2e3 = sext i4 %v2e3 to i32
241  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
242
243  %v1e4 = extractelement <8 x i4> %vec1, i64 4
244  %cv1e4 = sext i4 %v1e4 to i32
245  %v2e4 = extractelement <8 x i4> %vec2, i64 4
246  %cv2e4 = sext i4 %v2e4 to i32
247  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
248
249  %v1e5 = extractelement <8 x i4> %vec1, i64 5
250  %cv1e5 = sext i4 %v1e5 to i32
251  %v2e5 = extractelement <8 x i4> %vec2, i64 5
252  %cv2e5 = sext i4 %v2e5 to i32
253  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
254
255  %v1e6 = extractelement <8 x i4> %vec1, i64 6
256  %cv1e6 = sext i4 %v1e6 to i32
257  %v2e6 = extractelement <8 x i4> %vec2, i64 6
258  %cv2e6 = sext i4 %v2e6 to i32
259  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
260
261  %v1e7 = extractelement <8 x i4> %vec1, i64 7
262  %cv1e7 = sext i4 %v1e7 to i32
263  %v2e7 = extractelement <8 x i4> %vec2, i64 7
264  %cv2e7 = sext i4 %v2e7 to i32
265  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
266
; Accumulate: start from the i32 already stored at %dst and add the products
; in lane order (mul0 first, mul7 last).
267  %acc = load i32, i32 addrspace(1)* %dst, align 4
268  %add1 = add i32 %mul0, %acc
269  %add2 = add i32 %add1, %mul1
270  %add3 = add i32 %add2, %mul2
271  %add4 = add i32 %add3, %mul3
272  %add5 = add i32 %add4, %mul4
273  %add6 = add i32 %add5, %mul5
274  %add7 = add i32 %add6, %mul6
275  %add8 = add i32 %add7, %mul7
276
; Write the updated accumulator back to the same location.
277  store i32 %add8, i32 addrspace(1)* %dst, align 4
278  ret void
279}
280
281; TODO: Once the unnecessary zero extensions of the elements are removed,
282; the pattern recognizer will kick in.
; idot8_acc16: signed dot product of two <8 x i4> vectors with an i16 accumulator.
; The kernel loads one <8 x i4> from %src1 and one from %src2, sign-extends every
; lane to i16, multiplies matching lanes, adds the eight products to the i16
; currently stored at %dst and writes the sum back to %dst.
; Per the assertions below, no run selects v_dot8_i32_i4 for this kernel; every
; target expands to per-lane extracts followed by mad/mul/add sequences.
283define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
284; GFX7-LABEL: idot8_acc16:
285; GFX7:       ; %bb.0: ; %entry
286; GFX7-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
287; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
288; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
289; GFX7-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
290; GFX7-NEXT:    s_mov_b32 s26, -1
291; GFX7-NEXT:    s_mov_b32 s27, 0xe8f000
292; GFX7-NEXT:    s_add_u32 s24, s24, s3
293; GFX7-NEXT:    s_mov_b32 s3, 0xf000
294; GFX7-NEXT:    s_mov_b32 s2, -1
295; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
296; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
297; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
298; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
299; GFX7-NEXT:    s_addc_u32 s25, s25, 0
300; GFX7-NEXT:    s_mov_b32 s8, 0xffff
301; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
302; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
303; GFX7-NEXT:    s_bfe_i32 s7, s5, 0x40000
304; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x40004
305; GFX7-NEXT:    s_and_b32 s7, s7, s8
306; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x40004
307; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x40008
308; GFX7-NEXT:    s_and_b32 s10, s10, s8
309; GFX7-NEXT:    s_and_b32 s6, s6, s8
310; GFX7-NEXT:    v_mov_b32_e32 v1, s7
311; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x40008
312; GFX7-NEXT:    s_bfe_i32 s14, s5, 0x4000c
313; GFX7-NEXT:    s_and_b32 s12, s12, s8
314; GFX7-NEXT:    s_and_b32 s9, s9, s8
315; GFX7-NEXT:    v_mov_b32_e32 v2, s10
316; GFX7-NEXT:    s_bfe_i32 s13, s4, 0x4000c
317; GFX7-NEXT:    s_bfe_i32 s16, s5, 0x40010
318; GFX7-NEXT:    s_and_b32 s14, s14, s8
319; GFX7-NEXT:    s_and_b32 s11, s11, s8
320; GFX7-NEXT:    v_mov_b32_e32 v3, s12
321; GFX7-NEXT:    s_bfe_i32 s15, s4, 0x40010
322; GFX7-NEXT:    s_bfe_i32 s18, s5, 0x40014
323; GFX7-NEXT:    s_and_b32 s16, s16, s8
324; GFX7-NEXT:    s_and_b32 s13, s13, s8
325; GFX7-NEXT:    v_mov_b32_e32 v4, s14
326; GFX7-NEXT:    s_bfe_i32 s20, s5, 0x40018
327; GFX7-NEXT:    s_bfe_i32 s17, s4, 0x40014
328; GFX7-NEXT:    s_and_b32 s18, s18, s8
329; GFX7-NEXT:    s_and_b32 s15, s15, s8
330; GFX7-NEXT:    v_mov_b32_e32 v5, s16
331; GFX7-NEXT:    s_bfe_i32 s19, s4, 0x40018
332; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
333; GFX7-NEXT:    s_and_b32 s20, s20, s8
334; GFX7-NEXT:    s_and_b32 s17, s17, s8
335; GFX7-NEXT:    v_mov_b32_e32 v6, s18
336; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
337; GFX7-NEXT:    s_and_b32 s19, s19, s8
338; GFX7-NEXT:    s_and_b32 s5, s5, s8
339; GFX7-NEXT:    v_mov_b32_e32 v7, s20
340; GFX7-NEXT:    s_and_b32 s4, s4, s8
341; GFX7-NEXT:    s_waitcnt vmcnt(0)
342; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
343; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
344; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
345; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v4, v0
346; GFX7-NEXT:    v_mad_u32_u24 v0, s15, v5, v0
347; GFX7-NEXT:    v_mad_u32_u24 v0, s17, v6, v0
348; GFX7-NEXT:    v_mad_u32_u24 v0, s19, v7, v0
349; GFX7-NEXT:    v_mov_b32_e32 v1, s5
350; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
351; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
352; GFX7-NEXT:    s_endpgm
353;
354; GFX8-LABEL: idot8_acc16:
355; GFX8:       ; %bb.0: ; %entry
356; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
357; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
358; GFX8-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
359; GFX8-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
360; GFX8-NEXT:    s_mov_b32 s18, -1
361; GFX8-NEXT:    s_mov_b32 s19, 0xe80000
362; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
363; GFX8-NEXT:    v_mov_b32_e32 v0, s0
364; GFX8-NEXT:    v_mov_b32_e32 v1, s1
365; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
366; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
367; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
368; GFX8-NEXT:    s_add_u32 s16, s16, s3
369; GFX8-NEXT:    s_addc_u32 s17, s17, 0
370; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
371; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x40000
372; GFX8-NEXT:    s_bfe_i32 s5, s1, 0x40000
373; GFX8-NEXT:    s_bfe_i32 s7, s1, 0x40004
374; GFX8-NEXT:    s_bfe_i32 s9, s1, 0x40008
375; GFX8-NEXT:    v_mov_b32_e32 v6, s5
376; GFX8-NEXT:    s_lshr_b32 s2, s0, 12
377; GFX8-NEXT:    s_lshr_b32 s3, s1, 12
378; GFX8-NEXT:    s_bfe_i32 s6, s0, 0x40004
379; GFX8-NEXT:    s_bfe_i32 s8, s0, 0x40008
380; GFX8-NEXT:    v_mov_b32_e32 v3, s9
381; GFX8-NEXT:    v_mov_b32_e32 v7, s7
382; GFX8-NEXT:    v_lshlrev_b16_e64 v4, 12, s2
383; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s3
384; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s8, v3
385; GFX8-NEXT:    s_bfe_i32 s11, s1, 0x40010
386; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
387; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
388; GFX8-NEXT:    s_bfe_i32 s13, s1, 0x40014
389; GFX8-NEXT:    s_bfe_i32 s10, s0, 0x40010
390; GFX8-NEXT:    v_mov_b32_e32 v8, s11
391; GFX8-NEXT:    s_bfe_i32 s15, s1, 0x40018
392; GFX8-NEXT:    s_bfe_i32 s12, s0, 0x40014
393; GFX8-NEXT:    v_mov_b32_e32 v9, s13
394; GFX8-NEXT:    s_bfe_i32 s14, s0, 0x40018
395; GFX8-NEXT:    s_ashr_i32 s1, s1, 28
396; GFX8-NEXT:    v_mov_b32_e32 v10, s15
397; GFX8-NEXT:    s_ashr_i32 s0, s0, 28
398; GFX8-NEXT:    s_waitcnt vmcnt(0)
399; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v6, v2
400; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v7, v2
401; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
402; GFX8-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
403; GFX8-NEXT:    v_mad_i32_i24 v2, s10, v8, v2
404; GFX8-NEXT:    v_mad_i32_i24 v2, s12, v9, v2
405; GFX8-NEXT:    v_mad_i32_i24 v2, s14, v10, v2
406; GFX8-NEXT:    v_mov_b32_e32 v3, s1
407; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
408; GFX8-NEXT:    flat_store_short v[0:1], v2
409; GFX8-NEXT:    s_endpgm
410;
411; GFX9-LABEL: idot8_acc16:
412; GFX9:       ; %bb.0: ; %entry
413; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
414; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
415; GFX9-NEXT:    v_mov_b32_e32 v0, 0
416; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
417; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
418; GFX9-NEXT:    s_mov_b32 s22, -1
419; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
421; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
422; GFX9-NEXT:    s_add_u32 s20, s20, s3
423; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
424; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
425; GFX9-NEXT:    s_addc_u32 s21, s21, 0
426; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
427; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x40000
428; GFX9-NEXT:    s_bfe_i32 s7, s3, 0x40000
429; GFX9-NEXT:    s_bfe_i32 s9, s3, 0x40004
430; GFX9-NEXT:    s_bfe_i32 s11, s3, 0x40008
431; GFX9-NEXT:    v_mov_b32_e32 v5, s7
432; GFX9-NEXT:    s_lshr_b32 s4, s2, 12
433; GFX9-NEXT:    s_lshr_b32 s5, s3, 12
434; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40004
435; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x40008
436; GFX9-NEXT:    v_mov_b32_e32 v2, s11
437; GFX9-NEXT:    v_mov_b32_e32 v6, s9
438; GFX9-NEXT:    v_lshlrev_b16_e64 v3, 12, s4
439; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s5
440; GFX9-NEXT:    v_mul_i32_i24_e32 v2, s10, v2
441; GFX9-NEXT:    s_bfe_i32 s13, s3, 0x40010
442; GFX9-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
443; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
444; GFX9-NEXT:    s_bfe_i32 s15, s3, 0x40014
445; GFX9-NEXT:    s_bfe_i32 s12, s2, 0x40010
446; GFX9-NEXT:    v_mov_b32_e32 v7, s13
447; GFX9-NEXT:    s_bfe_i32 s17, s3, 0x40018
448; GFX9-NEXT:    s_bfe_i32 s14, s2, 0x40014
449; GFX9-NEXT:    v_mov_b32_e32 v8, s15
450; GFX9-NEXT:    s_bfe_i32 s16, s2, 0x40018
451; GFX9-NEXT:    s_ashr_i32 s3, s3, 28
452; GFX9-NEXT:    v_mov_b32_e32 v9, s17
453; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
454; GFX9-NEXT:    s_waitcnt vmcnt(0)
455; GFX9-NEXT:    v_mad_i32_i24 v1, s6, v5, v1
456; GFX9-NEXT:    v_mad_i32_i24 v1, s8, v6, v1
457; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
458; GFX9-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
459; GFX9-NEXT:    v_mad_i32_i24 v1, s12, v7, v1
460; GFX9-NEXT:    v_mad_i32_i24 v1, s14, v8, v1
461; GFX9-NEXT:    v_mad_i32_i24 v1, s16, v9, v1
462; GFX9-NEXT:    v_mov_b32_e32 v2, s3
463; GFX9-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
464; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
465; GFX9-NEXT:    s_endpgm
466;
467; GFX9-DL-LABEL: idot8_acc16:
468; GFX9-DL:       ; %bb.0: ; %entry
469; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
470; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
471; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
472; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
473; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
474; GFX9-DL-NEXT:    s_mov_b32 s22, -1
475; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
476; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
477; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
478; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
479; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
480; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
481; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
482; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
483; GFX9-DL-NEXT:    s_bfe_i32 s6, s2, 0x40000
484; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x40000
485; GFX9-DL-NEXT:    s_bfe_i32 s9, s3, 0x40004
486; GFX9-DL-NEXT:    s_bfe_i32 s11, s3, 0x40008
487; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s7
488; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 12
489; GFX9-DL-NEXT:    s_lshr_b32 s5, s3, 12
490; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x40004
491; GFX9-DL-NEXT:    s_bfe_i32 s10, s2, 0x40008
492; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
493; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s9
494; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s4
495; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s5
496; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v2, s10, v2
497; GFX9-DL-NEXT:    s_bfe_i32 s13, s3, 0x40010
498; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
499; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
500; GFX9-DL-NEXT:    s_bfe_i32 s15, s3, 0x40014
501; GFX9-DL-NEXT:    s_bfe_i32 s12, s2, 0x40010
502; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
503; GFX9-DL-NEXT:    s_bfe_i32 s17, s3, 0x40018
504; GFX9-DL-NEXT:    s_bfe_i32 s14, s2, 0x40014
505; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s15
506; GFX9-DL-NEXT:    s_bfe_i32 s16, s2, 0x40018
507; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 28
508; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s17
509; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 28
510; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
511; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v5, v1
512; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s8, v6, v1
513; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
514; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
515; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s12, v7, v1
516; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s14, v8, v1
517; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s16, v9, v1
518; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
519; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
520; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
521; GFX9-DL-NEXT:    s_endpgm
522;
523; GFX10-DL-LABEL: idot8_acc16:
524; GFX10-DL:       ; %bb.0: ; %entry
525; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
526; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
527; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
528; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
529; GFX10-DL-NEXT:    s_mov_b32 s14, -1
530; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
531; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
532; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
533; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
534; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
535; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
536; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
537; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
538; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 12
540; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 12
541; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
542; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
543; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s2
544; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s3
545; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
546; GFX10-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
547; GFX10-DL-NEXT:    s_bfe_i32 s10, s1, 0x40008
548; GFX10-DL-NEXT:    s_bfe_i32 s2, s1, 0x40004
549; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v4, s9, s10
550; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v2, 12, v2
551; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
552; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40010
553; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
554; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
555; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s8, s2, v1
556; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
557; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
558; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
559; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
560; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
561; GFX10-DL-NEXT:    v_mad_u32_u24 v1, v2, v3, v1
562; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
563; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40014
564; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40014
565; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
566; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
567; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40018
568; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
569; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
570; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
571; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s0, s1, v1
572; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
573; GFX10-DL-NEXT:    s_endpgm
574                                       <8 x i4> addrspace(1)* %src2,
575                                       i16 addrspace(1)* nocapture %dst) {
576entry:
; Load both packed operands; each is a vector of eight i4 lanes.
577  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
578  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
579
; Lanes 0..7 follow one template: extract the i4 lane from each vector,
; sign-extend both to i16 and multiply them as i16.
; NOTE(review): the multiplies carry `nuw` even though the operands are
; sign-extended; a negative lane would make most products wrap in the unsigned
; sense - presumably inherited from the reduced reproducer, confirm it is intended.
580  %v1e0 = extractelement <8 x i4> %vec1, i64 0
581  %cv1e0 = sext i4 %v1e0 to i16
582  %v2e0 = extractelement <8 x i4> %vec2, i64 0
583  %cv2e0 = sext i4 %v2e0 to i16
584  %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
585
586  %v1e1 = extractelement <8 x i4> %vec1, i64 1
587  %cv1e1 = sext i4 %v1e1 to i16
588  %v2e1 = extractelement <8 x i4> %vec2, i64 1
589  %cv2e1 = sext i4 %v2e1 to i16
590  %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
591
592  %v1e2 = extractelement <8 x i4> %vec1, i64 2
593  %cv1e2 = sext i4 %v1e2 to i16
594  %v2e2 = extractelement <8 x i4> %vec2, i64 2
595  %cv2e2 = sext i4 %v2e2 to i16
596  %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
597
598  %v1e3 = extractelement <8 x i4> %vec1, i64 3
599  %cv1e3 = sext i4 %v1e3 to i16
600  %v2e3 = extractelement <8 x i4> %vec2, i64 3
601  %cv2e3 = sext i4 %v2e3 to i16
602  %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
603
604  %v1e4 = extractelement <8 x i4> %vec1, i64 4
605  %cv1e4 = sext i4 %v1e4 to i16
606  %v2e4 = extractelement <8 x i4> %vec2, i64 4
607  %cv2e4 = sext i4 %v2e4 to i16
608  %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
609
610  %v1e5 = extractelement <8 x i4> %vec1, i64 5
611  %cv1e5 = sext i4 %v1e5 to i16
612  %v2e5 = extractelement <8 x i4> %vec2, i64 5
613  %cv2e5 = sext i4 %v2e5 to i16
614  %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
615
616  %v1e6 = extractelement <8 x i4> %vec1, i64 6
617  %cv1e6 = sext i4 %v1e6 to i16
618  %v2e6 = extractelement <8 x i4> %vec2, i64 6
619  %cv2e6 = sext i4 %v2e6 to i16
620  %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
621
622  %v1e7 = extractelement <8 x i4> %vec1, i64 7
623  %cv1e7 = sext i4 %v1e7 to i16
624  %v2e7 = extractelement <8 x i4> %vec2, i64 7
625  %cv2e7 = sext i4 %v2e7 to i16
626  %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
627
; Accumulate: start from the i16 already stored at %dst and add the products
; in lane order (mul0 first, mul7 last).
; NOTE(review): the i16 load and store below both state `align 4`, which is
; stricter than the type requires - presumably deliberate for this test; confirm.
628  %acc = load i16, i16 addrspace(1)* %dst, align 4
629  %add1 = add i16 %mul0, %acc
630  %add2 = add i16 %add1, %mul1
631  %add3 = add i16 %add2, %mul2
632  %add4 = add i16 %add3, %mul3
633  %add5 = add i16 %add4, %mul4
634  %add6 = add i16 %add5, %mul5
635  %add7 = add i16 %add6, %mul6
636  %add8 = add i16 %add7, %mul7
637
; Write the updated accumulator back to the same location.
638  store i16 %add8, i16 addrspace(1)* %dst, align 4
639  ret void
640}
641
642; TODO: Support this pattern.
643define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
644; GFX7-LABEL: idot8_acc8:
645; GFX7:       ; %bb.0: ; %entry
646; GFX7-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
647; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
648; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
649; GFX7-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
650; GFX7-NEXT:    s_mov_b32 s26, -1
651; GFX7-NEXT:    s_mov_b32 s27, 0xe8f000
652; GFX7-NEXT:    s_add_u32 s24, s24, s3
653; GFX7-NEXT:    s_mov_b32 s3, 0xf000
654; GFX7-NEXT:    s_mov_b32 s2, -1
655; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
656; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
657; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
658; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
659; GFX7-NEXT:    s_addc_u32 s25, s25, 0
660; GFX7-NEXT:    s_movk_i32 s8, 0xff
661; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
662; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
663; GFX7-NEXT:    s_bfe_i32 s7, s5, 0x40000
664; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x40004
665; GFX7-NEXT:    s_and_b32 s7, s7, s8
666; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x40004
667; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x40008
668; GFX7-NEXT:    s_and_b32 s10, s10, s8
669; GFX7-NEXT:    s_and_b32 s6, s6, s8
670; GFX7-NEXT:    v_mov_b32_e32 v1, s7
671; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x40008
672; GFX7-NEXT:    s_bfe_i32 s14, s5, 0x4000c
673; GFX7-NEXT:    s_and_b32 s12, s12, s8
674; GFX7-NEXT:    s_and_b32 s9, s9, s8
675; GFX7-NEXT:    v_mov_b32_e32 v2, s10
676; GFX7-NEXT:    s_bfe_i32 s13, s4, 0x4000c
677; GFX7-NEXT:    s_bfe_i32 s16, s5, 0x40010
678; GFX7-NEXT:    s_and_b32 s14, s14, s8
679; GFX7-NEXT:    s_and_b32 s11, s11, s8
680; GFX7-NEXT:    v_mov_b32_e32 v3, s12
681; GFX7-NEXT:    s_bfe_i32 s15, s4, 0x40010
682; GFX7-NEXT:    s_bfe_i32 s18, s5, 0x40014
683; GFX7-NEXT:    s_and_b32 s16, s16, s8
684; GFX7-NEXT:    s_and_b32 s13, s13, s8
685; GFX7-NEXT:    v_mov_b32_e32 v4, s14
686; GFX7-NEXT:    s_bfe_i32 s20, s5, 0x40018
687; GFX7-NEXT:    s_bfe_i32 s17, s4, 0x40014
688; GFX7-NEXT:    s_and_b32 s18, s18, s8
689; GFX7-NEXT:    s_and_b32 s15, s15, s8
690; GFX7-NEXT:    v_mov_b32_e32 v5, s16
691; GFX7-NEXT:    s_bfe_i32 s19, s4, 0x40018
692; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
693; GFX7-NEXT:    s_and_b32 s20, s20, s8
694; GFX7-NEXT:    s_and_b32 s17, s17, s8
695; GFX7-NEXT:    v_mov_b32_e32 v6, s18
696; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
697; GFX7-NEXT:    s_and_b32 s19, s19, s8
698; GFX7-NEXT:    s_and_b32 s5, s5, s8
699; GFX7-NEXT:    v_mov_b32_e32 v7, s20
700; GFX7-NEXT:    s_and_b32 s4, s4, s8
701; GFX7-NEXT:    s_waitcnt vmcnt(0)
702; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
703; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
704; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
705; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v4, v0
706; GFX7-NEXT:    v_mad_u32_u24 v0, s15, v5, v0
707; GFX7-NEXT:    v_mad_u32_u24 v0, s17, v6, v0
708; GFX7-NEXT:    v_mad_u32_u24 v0, s19, v7, v0
709; GFX7-NEXT:    v_mov_b32_e32 v1, s5
710; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
711; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
712; GFX7-NEXT:    s_endpgm
713;
714; GFX8-LABEL: idot8_acc8:
715; GFX8:       ; %bb.0: ; %entry
716; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
717; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
718; GFX8-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
719; GFX8-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
720; GFX8-NEXT:    s_mov_b32 s22, -1
721; GFX8-NEXT:    s_mov_b32 s23, 0xe80000
722; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
723; GFX8-NEXT:    v_mov_b32_e32 v0, s0
724; GFX8-NEXT:    v_mov_b32_e32 v1, s1
725; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
726; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
727; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
728; GFX8-NEXT:    s_add_u32 s20, s20, s3
729; GFX8-NEXT:    s_addc_u32 s21, s21, 0
730; GFX8-NEXT:    s_movk_i32 s0, 0xff
731; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
732; GFX8-NEXT:    s_lshr_b32 s3, s1, 12
733; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x40000
734; GFX8-NEXT:    s_lshr_b32 s4, s2, 12
735; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x40004
736; GFX8-NEXT:    s_bfe_i32 s10, s2, 0x40008
737; GFX8-NEXT:    s_bfe_i32 s5, s1, 0x40000
738; GFX8-NEXT:    v_mov_b32_e32 v6, s6
739; GFX8-NEXT:    v_lshlrev_b16_e64 v4, 12, s3
740; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
741; GFX8-NEXT:    s_bfe_i32 s7, s1, 0x40004
742; GFX8-NEXT:    s_bfe_i32 s9, s1, 0x40008
743; GFX8-NEXT:    v_mov_b32_e32 v3, s10
744; GFX8-NEXT:    v_mov_b32_e32 v7, s8
745; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
746; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
747; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
748; GFX8-NEXT:    s_bfe_i32 s12, s2, 0x40010
749; GFX8-NEXT:    v_and_b32_e32 v4, s0, v4
750; GFX8-NEXT:    v_and_b32_e32 v5, s0, v5
751; GFX8-NEXT:    s_bfe_i32 s14, s2, 0x40014
752; GFX8-NEXT:    s_bfe_i32 s11, s1, 0x40010
753; GFX8-NEXT:    v_mov_b32_e32 v8, s12
754; GFX8-NEXT:    s_bfe_i32 s16, s2, 0x40018
755; GFX8-NEXT:    s_bfe_i32 s13, s1, 0x40014
756; GFX8-NEXT:    v_mov_b32_e32 v9, s14
757; GFX8-NEXT:    s_bfe_i32 s15, s1, 0x40018
758; GFX8-NEXT:    s_ashr_i32 s2, s2, 28
759; GFX8-NEXT:    v_mov_b32_e32 v10, s16
760; GFX8-NEXT:    s_ashr_i32 s1, s1, 28
761; GFX8-NEXT:    s_waitcnt vmcnt(0)
762; GFX8-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
763; GFX8-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
764; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
765; GFX8-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
766; GFX8-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
767; GFX8-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
768; GFX8-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
769; GFX8-NEXT:    v_mov_b32_e32 v3, s2
770; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
771; GFX8-NEXT:    flat_store_byte v[0:1], v2
772; GFX8-NEXT:    s_endpgm
773;
774; GFX9-LABEL: idot8_acc8:
775; GFX9:       ; %bb.0: ; %entry
776; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
777; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
778; GFX9-NEXT:    v_mov_b32_e32 v0, 0
779; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
780; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
781; GFX9-NEXT:    s_mov_b32 s22, -1
782; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
783; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
784; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
785; GFX9-NEXT:    s_add_u32 s20, s20, s3
786; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
787; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
788; GFX9-NEXT:    s_addc_u32 s21, s21, 0
789; GFX9-NEXT:    s_movk_i32 s2, 0xff
790; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
791; GFX9-NEXT:    s_lshr_b32 s5, s3, 12
792; GFX9-NEXT:    s_bfe_i32 s8, s4, 0x40000
793; GFX9-NEXT:    s_lshr_b32 s6, s4, 12
794; GFX9-NEXT:    s_bfe_i32 s10, s4, 0x40004
795; GFX9-NEXT:    s_bfe_i32 s12, s4, 0x40008
796; GFX9-NEXT:    s_bfe_i32 s7, s3, 0x40000
797; GFX9-NEXT:    v_mov_b32_e32 v5, s8
798; GFX9-NEXT:    v_lshlrev_b16_e64 v3, 12, s5
799; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s6
800; GFX9-NEXT:    s_bfe_i32 s9, s3, 0x40004
801; GFX9-NEXT:    s_bfe_i32 s11, s3, 0x40008
802; GFX9-NEXT:    v_mov_b32_e32 v2, s12
803; GFX9-NEXT:    v_mov_b32_e32 v6, s10
804; GFX9-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
805; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
806; GFX9-NEXT:    v_mul_i32_i24_e32 v2, s11, v2
807; GFX9-NEXT:    s_bfe_i32 s14, s4, 0x40010
808; GFX9-NEXT:    v_and_b32_e32 v3, s2, v3
809; GFX9-NEXT:    v_and_b32_e32 v4, s2, v4
810; GFX9-NEXT:    s_bfe_i32 s16, s4, 0x40014
811; GFX9-NEXT:    s_bfe_i32 s13, s3, 0x40010
812; GFX9-NEXT:    v_mov_b32_e32 v7, s14
813; GFX9-NEXT:    s_bfe_i32 s18, s4, 0x40018
814; GFX9-NEXT:    s_bfe_i32 s15, s3, 0x40014
815; GFX9-NEXT:    v_mov_b32_e32 v8, s16
816; GFX9-NEXT:    s_bfe_i32 s17, s3, 0x40018
817; GFX9-NEXT:    s_ashr_i32 s4, s4, 28
818; GFX9-NEXT:    v_mov_b32_e32 v9, s18
819; GFX9-NEXT:    s_ashr_i32 s3, s3, 28
820; GFX9-NEXT:    s_waitcnt vmcnt(0)
821; GFX9-NEXT:    v_mad_i32_i24 v1, s7, v5, v1
822; GFX9-NEXT:    v_mad_i32_i24 v1, s9, v6, v1
823; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
824; GFX9-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
825; GFX9-NEXT:    v_mad_i32_i24 v1, s13, v7, v1
826; GFX9-NEXT:    v_mad_i32_i24 v1, s15, v8, v1
827; GFX9-NEXT:    v_mad_i32_i24 v1, s17, v9, v1
828; GFX9-NEXT:    v_mov_b32_e32 v2, s4
829; GFX9-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
830; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
831; GFX9-NEXT:    s_endpgm
832;
833; GFX9-DL-LABEL: idot8_acc8:
834; GFX9-DL:       ; %bb.0: ; %entry
835; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
836; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
837; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
838; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
839; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
840; GFX9-DL-NEXT:    s_mov_b32 s22, -1
841; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
842; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
843; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
844; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
845; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
846; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
847; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
848; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
849; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
850; GFX9-DL-NEXT:    s_lshr_b32 s5, s3, 12
851; GFX9-DL-NEXT:    s_bfe_i32 s8, s4, 0x40000
852; GFX9-DL-NEXT:    s_lshr_b32 s6, s4, 12
853; GFX9-DL-NEXT:    s_bfe_i32 s10, s4, 0x40004
854; GFX9-DL-NEXT:    s_bfe_i32 s12, s4, 0x40008
855; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x40000
856; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s8
857; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s5
858; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s6
859; GFX9-DL-NEXT:    s_bfe_i32 s9, s3, 0x40004
860; GFX9-DL-NEXT:    s_bfe_i32 s11, s3, 0x40008
861; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s12
862; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s10
863; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
864; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
865; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v2, s11, v2
866; GFX9-DL-NEXT:    s_bfe_i32 s14, s4, 0x40010
867; GFX9-DL-NEXT:    v_and_b32_e32 v3, s2, v3
868; GFX9-DL-NEXT:    v_and_b32_e32 v4, s2, v4
869; GFX9-DL-NEXT:    s_bfe_i32 s16, s4, 0x40014
870; GFX9-DL-NEXT:    s_bfe_i32 s13, s3, 0x40010
871; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s14
872; GFX9-DL-NEXT:    s_bfe_i32 s18, s4, 0x40018
873; GFX9-DL-NEXT:    s_bfe_i32 s15, s3, 0x40014
874; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s16
875; GFX9-DL-NEXT:    s_bfe_i32 s17, s3, 0x40018
876; GFX9-DL-NEXT:    s_ashr_i32 s4, s4, 28
877; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s18
878; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 28
879; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
880; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s7, v5, v1
881; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s9, v6, v1
882; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
883; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
884; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s13, v7, v1
885; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s15, v8, v1
886; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s17, v9, v1
887; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
888; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
889; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
890; GFX9-DL-NEXT:    s_endpgm
891;
892; GFX10-DL-LABEL: idot8_acc8:
893; GFX10-DL:       ; %bb.0: ; %entry
894; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
895; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
896; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
897; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
898; GFX10-DL-NEXT:    s_mov_b32 s14, -1
899; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
900; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
901; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
902; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
903; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
904; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
905; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
906; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
907; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
908; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 12
909; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 12
910; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
911; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
912; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s2
913; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s3
914; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
915; GFX10-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
916; GFX10-DL-NEXT:    s_bfe_i32 s10, s1, 0x40008
917; GFX10-DL-NEXT:    s_bfe_i32 s2, s1, 0x40004
918; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v4, s9, s10
919; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v2, 12, v2
920; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
921; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40010
922; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
923; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
924; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s8, s2, v1
925; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
926; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
927; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
928; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
929; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
930; GFX10-DL-NEXT:    v_mad_u32_u24 v1, v2, v3, v1
931; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
932; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40014
933; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40014
934; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
935; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
936; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40018
937; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
938; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
939; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
940; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s0, s1, v1
941; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
942; GFX10-DL-NEXT:    s_endpgm
943                                       <8 x i4> addrspace(1)* %src2,
944                                       i8 addrspace(1)* nocapture %dst) {
945entry:
946  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
947  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
948
949  %v1e0 = extractelement <8 x i4> %vec1, i64 0
950  %cv1e0 = sext i4 %v1e0 to i8
951  %v2e0 = extractelement <8 x i4> %vec2, i64 0
952  %cv2e0 = sext i4 %v2e0 to i8
953  %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
954
955  %v1e1 = extractelement <8 x i4> %vec1, i64 1
956  %cv1e1 = sext i4 %v1e1 to i8
957  %v2e1 = extractelement <8 x i4> %vec2, i64 1
958  %cv2e1 = sext i4 %v2e1 to i8
959  %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
960
961  %v1e2 = extractelement <8 x i4> %vec1, i64 2
962  %cv1e2 = sext i4 %v1e2 to i8
963  %v2e2 = extractelement <8 x i4> %vec2, i64 2
964  %cv2e2 = sext i4 %v2e2 to i8
965  %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
966
967  %v1e3 = extractelement <8 x i4> %vec1, i64 3
968  %cv1e3 = sext i4 %v1e3 to i8
969  %v2e3 = extractelement <8 x i4> %vec2, i64 3
970  %cv2e3 = sext i4 %v2e3 to i8
971  %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
972
973  %v1e4 = extractelement <8 x i4> %vec1, i64 4
974  %cv1e4 = sext i4 %v1e4 to i8
975  %v2e4 = extractelement <8 x i4> %vec2, i64 4
976  %cv2e4 = sext i4 %v2e4 to i8
977  %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
978
979  %v1e5 = extractelement <8 x i4> %vec1, i64 5
980  %cv1e5 = sext i4 %v1e5 to i8
981  %v2e5 = extractelement <8 x i4> %vec2, i64 5
982  %cv2e5 = sext i4 %v2e5 to i8
983  %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
984
985  %v1e6 = extractelement <8 x i4> %vec1, i64 6
986  %cv1e6 = sext i4 %v1e6 to i8
987  %v2e6 = extractelement <8 x i4> %vec2, i64 6
988  %cv2e6 = sext i4 %v2e6 to i8
989  %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
990
991  %v1e7 = extractelement <8 x i4> %vec1, i64 7
992  %cv1e7 = sext i4 %v1e7 to i8
993  %v2e7 = extractelement <8 x i4> %vec2, i64 7
994  %cv2e7 = sext i4 %v2e7 to i8
995  %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
996
997  %acc = load i8, i8 addrspace(1)* %dst, align 4
998  %add1 = add i8 %mul0, %acc
999  %add2 = add i8 %add1, %mul1
1000  %add3 = add i8 %add2, %mul2
1001  %add4 = add i8 %add3, %mul3
1002  %add5 = add i8 %add4, %mul4
1003  %add6 = add i8 %add5, %mul5
1004  %add7 = add i8 %add6, %mul6
1005  %add8 = add i8 %add7, %mul7
1006
1007  store i8 %add8, i8 addrspace(1)* %dst, align 4
1008  ret void
1009}
1010
1011; Make sure the pattern is not recognized if there are multiple uses of the
1012; intermediate multiplications.
1013define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
1014; GFX7-LABEL: idot8_multiuses_mul1:
1015; GFX7:       ; %bb.0: ; %entry
1016; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1017; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1018; GFX7-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
1019; GFX7-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
1020; GFX7-NEXT:    s_mov_b32 s26, -1
1021; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1022; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1023; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1024; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
1025; GFX7-NEXT:    s_mov_b32 s27, 0xe8f000
1026; GFX7-NEXT:    s_add_u32 s24, s24, s3
1027; GFX7-NEXT:    s_addc_u32 s25, s25, 0
1028; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1029; GFX7-NEXT:    s_bfe_i32 s7, s5, 0x40000
1030; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
1031; GFX7-NEXT:    v_mov_b32_e32 v0, s7
1032; GFX7-NEXT:    v_mov_b32_e32 v1, s20
1033; GFX7-NEXT:    v_mad_i32_i24 v1, s6, v0, v1
1034; GFX7-NEXT:    s_bfe_i32 s9, s5, 0x40004
1035; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x40004
1036; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x40008
1037; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v0, v1
1038; GFX7-NEXT:    v_mov_b32_e32 v2, s9
1039; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v2, v0
1040; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x40008
1041; GFX7-NEXT:    v_mov_b32_e32 v2, s11
1042; GFX7-NEXT:    s_bfe_i32 s13, s5, 0x4000c
1043; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v2, v0
1044; GFX7-NEXT:    s_bfe_i32 s12, s4, 0x4000c
1045; GFX7-NEXT:    v_mov_b32_e32 v2, s13
1046; GFX7-NEXT:    s_bfe_i32 s15, s5, 0x40010
1047; GFX7-NEXT:    v_mad_i32_i24 v0, s12, v2, v0
1048; GFX7-NEXT:    s_bfe_i32 s14, s4, 0x40010
1049; GFX7-NEXT:    v_mov_b32_e32 v2, s15
1050; GFX7-NEXT:    s_bfe_i32 s17, s5, 0x40014
1051; GFX7-NEXT:    s_bfe_i32 s19, s5, 0x40018
1052; GFX7-NEXT:    v_mad_i32_i24 v0, s14, v2, v0
1053; GFX7-NEXT:    s_bfe_i32 s16, s4, 0x40014
1054; GFX7-NEXT:    v_mov_b32_e32 v2, s17
1055; GFX7-NEXT:    s_bfe_i32 s18, s4, 0x40018
1056; GFX7-NEXT:    v_mad_i32_i24 v0, s16, v2, v0
1057; GFX7-NEXT:    v_mov_b32_e32 v2, s19
1058; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
1059; GFX7-NEXT:    v_mad_i32_i24 v0, s18, v2, v0
1060; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
1061; GFX7-NEXT:    v_mov_b32_e32 v2, s5
1062; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v2, v0
1063; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1064; GFX7-NEXT:    s_mov_b32 s2, -1
1065; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1066; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1067; GFX7-NEXT:    s_endpgm
1068;
1069; GFX8-LABEL: idot8_multiuses_mul1:
1070; GFX8:       ; %bb.0: ; %entry
1071; GFX8-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1072; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1073; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1074; GFX8-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1075; GFX8-NEXT:    s_mov_b32 s22, -1
1076; GFX8-NEXT:    s_mov_b32 s23, 0xe80000
1077; GFX8-NEXT:    s_add_u32 s20, s20, s3
1078; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1079; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
1080; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
1081; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
1082; GFX8-NEXT:    s_addc_u32 s21, s21, 0
1083; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1084; GFX8-NEXT:    s_bfe_i32 s4, s2, 0x40000
1085; GFX8-NEXT:    s_bfe_i32 s5, s3, 0x40000
1086; GFX8-NEXT:    v_mov_b32_e32 v0, s5
1087; GFX8-NEXT:    v_mov_b32_e32 v1, s18
1088; GFX8-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
1089; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x40004
1090; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x40004
1091; GFX8-NEXT:    s_bfe_i32 s9, s3, 0x40008
1092; GFX8-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
1093; GFX8-NEXT:    v_mov_b32_e32 v2, s7
1094; GFX8-NEXT:    v_mad_i32_i24 v0, s6, v2, v0
1095; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x40008
1096; GFX8-NEXT:    v_mov_b32_e32 v2, s9
1097; GFX8-NEXT:    s_bfe_i32 s11, s3, 0x4000c
1098; GFX8-NEXT:    v_mad_i32_i24 v0, s8, v2, v0
1099; GFX8-NEXT:    s_bfe_i32 s10, s2, 0x4000c
1100; GFX8-NEXT:    v_mov_b32_e32 v2, s11
1101; GFX8-NEXT:    s_bfe_i32 s13, s3, 0x40010
1102; GFX8-NEXT:    v_mad_i32_i24 v0, s10, v2, v0
1103; GFX8-NEXT:    s_bfe_i32 s12, s2, 0x40010
1104; GFX8-NEXT:    v_mov_b32_e32 v2, s13
1105; GFX8-NEXT:    s_bfe_i32 s15, s3, 0x40014
1106; GFX8-NEXT:    s_bfe_i32 s17, s3, 0x40018
1107; GFX8-NEXT:    v_mad_i32_i24 v0, s12, v2, v0
1108; GFX8-NEXT:    s_bfe_i32 s14, s2, 0x40014
1109; GFX8-NEXT:    v_mov_b32_e32 v2, s15
1110; GFX8-NEXT:    s_bfe_i32 s16, s2, 0x40018
1111; GFX8-NEXT:    v_mad_i32_i24 v0, s14, v2, v0
1112; GFX8-NEXT:    v_mov_b32_e32 v2, s17
1113; GFX8-NEXT:    s_ashr_i32 s3, s3, 28
1114; GFX8-NEXT:    v_mad_i32_i24 v0, s16, v2, v0
1115; GFX8-NEXT:    s_ashr_i32 s2, s2, 28
1116; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1117; GFX8-NEXT:    v_mad_i32_i24 v0, s2, v2, v0
1118; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
1119; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1120; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1121; GFX8-NEXT:    flat_store_dword v[0:1], v2
1122; GFX8-NEXT:    s_endpgm
1123;
1124; GFX9-LABEL: idot8_multiuses_mul1:
1125; GFX9:       ; %bb.0: ; %entry
1126; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1127; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1128; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1129; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1130; GFX9-NEXT:    s_mov_b32 s22, -1
1131; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
1132; GFX9-NEXT:    s_add_u32 s20, s20, s3
1133; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1134; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
1135; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
1136; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
1137; GFX9-NEXT:    s_addc_u32 s21, s21, 0
1138; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1139; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1140; GFX9-NEXT:    s_bfe_i32 s4, s2, 0x40000
1141; GFX9-NEXT:    s_bfe_i32 s5, s3, 0x40000
1142; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1143; GFX9-NEXT:    v_mov_b32_e32 v2, s18
1144; GFX9-NEXT:    v_mad_i32_i24 v2, s4, v1, v2
1145; GFX9-NEXT:    s_bfe_i32 s7, s3, 0x40004
1146; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x40004
1147; GFX9-NEXT:    s_bfe_i32 s9, s3, 0x40008
1148; GFX9-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
1149; GFX9-NEXT:    v_mov_b32_e32 v3, s7
1150; GFX9-NEXT:    v_mad_i32_i24 v1, s6, v3, v1
1151; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40008
1152; GFX9-NEXT:    v_mov_b32_e32 v3, s9
1153; GFX9-NEXT:    s_bfe_i32 s11, s3, 0x4000c
1154; GFX9-NEXT:    v_mad_i32_i24 v1, s8, v3, v1
1155; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x4000c
1156; GFX9-NEXT:    v_mov_b32_e32 v3, s11
1157; GFX9-NEXT:    s_bfe_i32 s13, s3, 0x40010
1158; GFX9-NEXT:    v_mad_i32_i24 v1, s10, v3, v1
1159; GFX9-NEXT:    s_bfe_i32 s12, s2, 0x40010
1160; GFX9-NEXT:    v_mov_b32_e32 v3, s13
1161; GFX9-NEXT:    s_bfe_i32 s15, s3, 0x40014
1162; GFX9-NEXT:    s_bfe_i32 s17, s3, 0x40018
1163; GFX9-NEXT:    v_mad_i32_i24 v1, s12, v3, v1
1164; GFX9-NEXT:    s_bfe_i32 s14, s2, 0x40014
1165; GFX9-NEXT:    v_mov_b32_e32 v3, s15
1166; GFX9-NEXT:    s_bfe_i32 s16, s2, 0x40018
1167; GFX9-NEXT:    v_mad_i32_i24 v1, s14, v3, v1
1168; GFX9-NEXT:    v_mov_b32_e32 v3, s17
1169; GFX9-NEXT:    s_ashr_i32 s3, s3, 28
1170; GFX9-NEXT:    v_mad_i32_i24 v1, s16, v3, v1
1171; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
1172; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1173; GFX9-NEXT:    v_mad_i32_i24 v1, s2, v3, v1
1174; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
1175; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1176; GFX9-NEXT:    s_endpgm
1177;
1178; GFX9-DL-LABEL: idot8_multiuses_mul1:
1179; GFX9-DL:       ; %bb.0: ; %entry
1180; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1181; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1182; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1183; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1184; GFX9-DL-NEXT:    s_mov_b32 s22, -1
1185; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
1186; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
1187; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1188; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
1189; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
1190; GFX9-DL-NEXT:    s_load_dword s18, s[0:1], 0x0
1191; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
1192; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1193; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1194; GFX9-DL-NEXT:    s_bfe_i32 s4, s2, 0x40000
1195; GFX9-DL-NEXT:    s_bfe_i32 s5, s3, 0x40000
1196; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
1197; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s18
1198; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v1, v2
1199; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x40004
1200; GFX9-DL-NEXT:    s_bfe_i32 s6, s2, 0x40004
1201; GFX9-DL-NEXT:    s_bfe_i32 s9, s3, 0x40008
1202; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
1203; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s7
1204; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v3, v1
1205; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x40008
1206; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
1207; GFX9-DL-NEXT:    s_bfe_i32 s11, s3, 0x4000c
1208; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s8, v3, v1
1209; GFX9-DL-NEXT:    s_bfe_i32 s10, s2, 0x4000c
1210; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s11
1211; GFX9-DL-NEXT:    s_bfe_i32 s13, s3, 0x40010
1212; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s10, v3, v1
1213; GFX9-DL-NEXT:    s_bfe_i32 s12, s2, 0x40010
1214; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s13
1215; GFX9-DL-NEXT:    s_bfe_i32 s15, s3, 0x40014
1216; GFX9-DL-NEXT:    s_bfe_i32 s17, s3, 0x40018
1217; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s12, v3, v1
1218; GFX9-DL-NEXT:    s_bfe_i32 s14, s2, 0x40014
1219; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s15
1220; GFX9-DL-NEXT:    s_bfe_i32 s16, s2, 0x40018
1221; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s14, v3, v1
1222; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
1223; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 28
1224; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s16, v3, v1
1225; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 28
1226; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
1227; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s2, v3, v1
1228; GFX9-DL-NEXT:    v_add_u32_e32 v1, v2, v1
1229; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1230; GFX9-DL-NEXT:    s_endpgm
1231;
1232; GFX10-DL-LABEL: idot8_multiuses_mul1:
1233; GFX10-DL:       ; %bb.0: ; %entry
1234; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1235; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1236; GFX10-DL-NEXT:    s_mov_b32 s10, -1
1237; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
1238; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
1239; GFX10-DL-NEXT:    s_clause 0x1
1240; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1241; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1242; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
1243; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1244; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1245; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
1246; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1247; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1248; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1249; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
1250; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40000
1251; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40000
1252; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s2, s3, v0
1253; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v0
1254; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40004
1255; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40004
1256; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1257; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40008
1258; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40008
1259; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1260; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x4000c
1261; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x4000c
1262; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1263; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
1264; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40010
1265; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1266; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40014
1267; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40014
1268; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1269; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
1270; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40018
1271; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
1272; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
1273; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1274; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s0, s1, v1
1275; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, v0, v1
1276; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
1277; GFX10-DL-NEXT:    s_endpgm
1278                                                <8 x i4> addrspace(1)* %src2,
1279                                                i32 addrspace(1)* nocapture %dst) {
1280entry:
  ; Load the two <8 x i4> operand vectors from %src1 and %src2.
1281  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
1282  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
1283
  ; For each of the eight elements: extract it from both vectors, sign-extend
  ; the i4 values to i32, and multiply them (%mul0 .. %mul7).
1284  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1285  %cv1e0 = sext i4 %v1e0 to i32
1286  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1287  %cv2e0 = sext i4 %v2e0 to i32
1288  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
1289
1290  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1291  %cv1e1 = sext i4 %v1e1 to i32
1292  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1293  %cv2e1 = sext i4 %v2e1 to i32
1294  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
1295
1296  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1297  %cv1e2 = sext i4 %v1e2 to i32
1298  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1299  %cv2e2 = sext i4 %v2e2 to i32
1300  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
1301
1302  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1303  %cv1e3 = sext i4 %v1e3 to i32
1304  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1305  %cv2e3 = sext i4 %v2e3 to i32
1306  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
1307
1308  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1309  %cv1e4 = sext i4 %v1e4 to i32
1310  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1311  %cv2e4 = sext i4 %v2e4 to i32
1312  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
1313
1314  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1315  %cv1e5 = sext i4 %v1e5 to i32
1316  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1317  %cv2e5 = sext i4 %v2e5 to i32
1318  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
1319
1320  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1321  %cv1e6 = sext i4 %v1e6 to i32
1322  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1323  %cv2e6 = sext i4 %v2e6 to i32
1324  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
1325
1326  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1327  %cv1e7 = sext i4 %v1e7 to i32
1328  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1329  %cv2e7 = sext i4 %v2e7 to i32
1330  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
1331
  ; The accumulator is the i32 currently stored at %dst.
  ; %mul0 is deliberately used twice: once to form %add and again to form
  ; %add1, so the first product has more than one use.
1332  %acc = load i32, i32 addrspace(1)* %dst, align 4
1333  %add =  add i32  %mul0, %acc
1334  %add1 = add i32 %mul0, %add
1335  %add2 = add i32 %add1, %mul1
1336  %add3 = add i32 %add2, %mul2
1337  %add4 = add i32 %add3, %mul3
1338  %add5 = add i32 %add4, %mul4
1339  %add6 = add i32 %add5, %mul5
1340  %add7 = add i32 %add6, %mul6
1341  %add8 = add i32 %add7, %mul7
1342
  ; %add (acc + mul0) is reused here in addition to feeding the add chain;
  ; the stored result is %add + %add8. The expected output for every checked
  ; prefix is a sequence of v_mad_i32_i24 followed by a final vector add.
1343  %res = add i32 %add, %add8
1344  store i32 %res, i32 addrspace(1)* %dst, align 4
1345  ret void
1346}
1347
1348; TODO: Support this pattern.
1349define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
1350; GFX7-LABEL: idot8_acc32_vecMul:
1351; GFX7:       ; %bb.0: ; %entry
1352; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1353; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1354; GFX7-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
1355; GFX7-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
1356; GFX7-NEXT:    s_mov_b32 s26, -1
1357; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1358; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1359; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1360; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
1361; GFX7-NEXT:    s_mov_b32 s27, 0xe8f000
1362; GFX7-NEXT:    s_add_u32 s24, s24, s3
1363; GFX7-NEXT:    s_addc_u32 s25, s25, 0
1364; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1365; GFX7-NEXT:    s_ashr_i32 s13, s5, 28
1366; GFX7-NEXT:    s_bfe_i32 s14, s5, 0x40018
1367; GFX7-NEXT:    s_bfe_i32 s15, s5, 0x40014
1368; GFX7-NEXT:    s_bfe_i32 s16, s5, 0x40010
1369; GFX7-NEXT:    s_bfe_i32 s17, s5, 0x4000c
1370; GFX7-NEXT:    s_bfe_i32 s18, s5, 0x40008
1371; GFX7-NEXT:    s_bfe_i32 s19, s5, 0x40004
1372; GFX7-NEXT:    s_bfe_i32 s5, s5, 0x40000
1373; GFX7-NEXT:    s_ashr_i32 s6, s4, 28
1374; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x40018
1375; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x40014
1376; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x40010
1377; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x4000c
1378; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x40008
1379; GFX7-NEXT:    s_bfe_i32 s12, s4, 0x40004
1380; GFX7-NEXT:    s_bfe_i32 s4, s4, 0x40000
1381; GFX7-NEXT:    v_mov_b32_e32 v0, s5
1382; GFX7-NEXT:    v_mov_b32_e32 v1, s20
1383; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
1384; GFX7-NEXT:    v_mov_b32_e32 v1, s19
1385; GFX7-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
1386; GFX7-NEXT:    v_mov_b32_e32 v1, s18
1387; GFX7-NEXT:    v_mad_i32_i24 v0, s11, v1, v0
1388; GFX7-NEXT:    v_mov_b32_e32 v1, s17
1389; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
1390; GFX7-NEXT:    v_mov_b32_e32 v1, s16
1391; GFX7-NEXT:    v_mad_i32_i24 v0, s9, v1, v0
1392; GFX7-NEXT:    v_mov_b32_e32 v1, s15
1393; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
1394; GFX7-NEXT:    v_mov_b32_e32 v1, s14
1395; GFX7-NEXT:    v_mad_i32_i24 v0, s7, v1, v0
1396; GFX7-NEXT:    v_mov_b32_e32 v1, s13
1397; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1398; GFX7-NEXT:    s_mov_b32 s2, -1
1399; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
1400; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1401; GFX7-NEXT:    s_endpgm
1402;
1403; GFX8-LABEL: idot8_acc32_vecMul:
1404; GFX8:       ; %bb.0: ; %entry
1405; GFX8-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1406; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1407; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1408; GFX8-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1409; GFX8-NEXT:    s_mov_b32 s22, -1
1410; GFX8-NEXT:    s_mov_b32 s23, 0xe80000
1411; GFX8-NEXT:    s_add_u32 s20, s20, s3
1412; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1413; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
1414; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
1415; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
1416; GFX8-NEXT:    s_addc_u32 s21, s21, 0
1417; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1418; GFX8-NEXT:    s_ashr_i32 s4, s2, 28
1419; GFX8-NEXT:    s_ashr_i32 s11, s3, 28
1420; GFX8-NEXT:    s_bfe_i32 s12, s3, 0x40018
1421; GFX8-NEXT:    s_bfe_i32 s13, s3, 0x40014
1422; GFX8-NEXT:    s_bfe_i32 s14, s3, 0x40010
1423; GFX8-NEXT:    s_bfe_i32 s15, s3, 0x4000c
1424; GFX8-NEXT:    s_bfe_i32 s16, s3, 0x40008
1425; GFX8-NEXT:    s_bfe_i32 s17, s3, 0x40004
1426; GFX8-NEXT:    s_bfe_i32 s3, s3, 0x40000
1427; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x40018
1428; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x40014
1429; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x40010
1430; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x4000c
1431; GFX8-NEXT:    s_bfe_i32 s9, s2, 0x40008
1432; GFX8-NEXT:    s_bfe_i32 s10, s2, 0x40004
1433; GFX8-NEXT:    s_bfe_i32 s2, s2, 0x40000
1434; GFX8-NEXT:    v_mov_b32_e32 v0, s3
1435; GFX8-NEXT:    v_mov_b32_e32 v1, s18
1436; GFX8-NEXT:    v_mad_i32_i24 v0, s2, v0, v1
1437; GFX8-NEXT:    v_mov_b32_e32 v1, s17
1438; GFX8-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
1439; GFX8-NEXT:    v_mov_b32_e32 v1, s16
1440; GFX8-NEXT:    v_mad_i32_i24 v0, s9, v1, v0
1441; GFX8-NEXT:    v_mov_b32_e32 v1, s15
1442; GFX8-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
1443; GFX8-NEXT:    v_mov_b32_e32 v1, s14
1444; GFX8-NEXT:    v_mad_i32_i24 v0, s7, v1, v0
1445; GFX8-NEXT:    v_mov_b32_e32 v1, s13
1446; GFX8-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
1447; GFX8-NEXT:    v_mov_b32_e32 v1, s12
1448; GFX8-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
1449; GFX8-NEXT:    v_mov_b32_e32 v1, s11
1450; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v1, v0
1451; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1452; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1453; GFX8-NEXT:    flat_store_dword v[0:1], v2
1454; GFX8-NEXT:    s_endpgm
1455;
1456; GFX9-LABEL: idot8_acc32_vecMul:
1457; GFX9:       ; %bb.0: ; %entry
1458; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1459; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1460; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1461; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1462; GFX9-NEXT:    s_mov_b32 s22, -1
1463; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
1464; GFX9-NEXT:    s_add_u32 s20, s20, s3
1465; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1466; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
1467; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
1468; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
1469; GFX9-NEXT:    s_addc_u32 s21, s21, 0
1470; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1471; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1472; GFX9-NEXT:    s_ashr_i32 s4, s2, 28
1473; GFX9-NEXT:    s_ashr_i32 s11, s3, 28
1474; GFX9-NEXT:    s_bfe_i32 s12, s3, 0x40018
1475; GFX9-NEXT:    s_bfe_i32 s13, s3, 0x40014
1476; GFX9-NEXT:    s_bfe_i32 s14, s3, 0x40010
1477; GFX9-NEXT:    s_bfe_i32 s15, s3, 0x4000c
1478; GFX9-NEXT:    s_bfe_i32 s16, s3, 0x40008
1479; GFX9-NEXT:    s_bfe_i32 s17, s3, 0x40004
1480; GFX9-NEXT:    s_bfe_i32 s3, s3, 0x40000
1481; GFX9-NEXT:    s_bfe_i32 s5, s2, 0x40018
1482; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x40014
1483; GFX9-NEXT:    s_bfe_i32 s7, s2, 0x40010
1484; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x4000c
1485; GFX9-NEXT:    s_bfe_i32 s9, s2, 0x40008
1486; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x40004
1487; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x40000
1488; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1489; GFX9-NEXT:    v_mov_b32_e32 v2, s18
1490; GFX9-NEXT:    v_mad_i32_i24 v1, s2, v1, v2
1491; GFX9-NEXT:    v_mov_b32_e32 v2, s17
1492; GFX9-NEXT:    v_mad_i32_i24 v1, s10, v2, v1
1493; GFX9-NEXT:    v_mov_b32_e32 v2, s16
1494; GFX9-NEXT:    v_mad_i32_i24 v1, s9, v2, v1
1495; GFX9-NEXT:    v_mov_b32_e32 v2, s15
1496; GFX9-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
1497; GFX9-NEXT:    v_mov_b32_e32 v2, s14
1498; GFX9-NEXT:    v_mad_i32_i24 v1, s7, v2, v1
1499; GFX9-NEXT:    v_mov_b32_e32 v2, s13
1500; GFX9-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
1501; GFX9-NEXT:    v_mov_b32_e32 v2, s12
1502; GFX9-NEXT:    v_mad_i32_i24 v1, s5, v2, v1
1503; GFX9-NEXT:    v_mov_b32_e32 v2, s11
1504; GFX9-NEXT:    v_mad_i32_i24 v1, s4, v2, v1
1505; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1506; GFX9-NEXT:    s_endpgm
1507;
1508; GFX9-DL-LABEL: idot8_acc32_vecMul:
1509; GFX9-DL:       ; %bb.0: ; %entry
1510; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1511; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1512; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1513; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1514; GFX9-DL-NEXT:    s_mov_b32 s10, -1
1515; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
1516; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
1517; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1518; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
1519; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
1520; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
1521; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1522; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
1523; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1524; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
1525; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
1526; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, s4, v1, v2
1527; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1528; GFX9-DL-NEXT:    s_endpgm
1529;
1530; GFX10-DL-LABEL: idot8_acc32_vecMul:
1531; GFX10-DL:       ; %bb.0: ; %entry
1532; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1533; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1534; GFX10-DL-NEXT:    s_mov_b32 s10, -1
1535; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
1536; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
1537; GFX10-DL-NEXT:    s_clause 0x1
1538; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1539; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1540; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1541; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
1542; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1543; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
1544; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1545; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1546; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1547; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
1548; GFX10-DL-NEXT:    v_dot8_i32_i4 v0, s0, s1, v0
1549; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
1550; GFX10-DL-NEXT:    s_endpgm
1551                                              <8 x i4> addrspace(1)* %src2,
1552                                              i32 addrspace(1)* nocapture %dst) {
1553entry:
1554  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
1555  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
1556
1557  %cvec1 = sext <8 x i4> %vec1 to <8 x i32>
1558  %cvec2 = sext <8 x i4> %vec2 to <8 x i32>
1559
1560  %mul = mul <8 x i32> %cvec1, %cvec2
1561  %mul0 = extractelement <8 x i32> %mul, i64 0
1562  %mul1 = extractelement <8 x i32> %mul, i64 1
1563  %mul2 = extractelement <8 x i32> %mul, i64 2
1564  %mul3 = extractelement <8 x i32> %mul, i64 3
1565  %mul4 = extractelement <8 x i32> %mul, i64 4
1566  %mul5 = extractelement <8 x i32> %mul, i64 5
1567  %mul6 = extractelement <8 x i32> %mul, i64 6
1568  %mul7 = extractelement <8 x i32> %mul, i64 7
1569
1570  %acc = load i32, i32 addrspace(1)* %dst, align 4
1571  %add1 = add i32 %mul0, %acc
1572  %add2 = add i32 %add1, %mul1
1573  %add3 = add i32 %add2, %mul2
1574  %add4 = add i32 %add3, %mul3
1575  %add5 = add i32 %add4, %mul4
1576  %add6 = add i32 %add5, %mul5
1577  %add7 = add i32 %add6, %mul6
1578  %add8 = add i32 %add7, %mul7
1579
1580  store i32 %add8, i32 addrspace(1)* %dst, align 4
1581  ret void
1582}
1583
1584; TODO: Support this pattern: the vector multiply with i16 accumulation below is not yet selected to v_dot8_i32_i4.
; idot8_acc16_vecMul: signed dot product of two <8 x i4> vectors, accumulated
; into an i16. Both inputs are sign-extended to <8 x i16> and multiplied as one
; vector; the eight lanes are then extracted and added, in lane order, to the
; i16 value loaded from %dst, and the sum is stored back to %dst.
; The expected output below for the dot-instruction run lines contains no
; v_dot8_i32_i4 (unlike idot8_acc32_vecMul above) -- presumably what the TODO
; preceding this function refers to.
1585define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
1586; GFX7-LABEL: idot8_acc16_vecMul:
1587; GFX7:       ; %bb.0: ; %entry
1588; GFX7-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
1589; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1590; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1591; GFX7-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
1592; GFX7-NEXT:    s_mov_b32 s26, -1
1593; GFX7-NEXT:    s_mov_b32 s27, 0xe8f000
1594; GFX7-NEXT:    s_add_u32 s24, s24, s3
1595; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1596; GFX7-NEXT:    s_mov_b32 s2, -1
1597; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1598; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
1599; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1600; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1601; GFX7-NEXT:    s_addc_u32 s25, s25, 0
1602; GFX7-NEXT:    s_mov_b32 s8, 0xffff
1603; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1604; GFX7-NEXT:    s_ashr_i32 s6, s4, 28
1605; GFX7-NEXT:    s_bfe_i32 s15, s5, 0x40018
1606; GFX7-NEXT:    s_bfe_i32 s16, s5, 0x40014
1607; GFX7-NEXT:    s_bfe_i32 s17, s5, 0x40010
1608; GFX7-NEXT:    s_bfe_i32 s18, s5, 0x40008
1609; GFX7-NEXT:    s_bfe_i32 s19, s5, 0x4000c
1610; GFX7-NEXT:    s_bfe_i32 s20, s5, 0x40000
1611; GFX7-NEXT:    s_ashr_i32 s14, s5, 28
1612; GFX7-NEXT:    s_bfe_i32 s5, s5, 0x40004
1613; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x40018
1614; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x40014
1615; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x40010
1616; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x40008
1617; GFX7-NEXT:    v_mov_b32_e32 v4, s18
1618; GFX7-NEXT:    s_bfe_i32 s12, s4, 0x4000c
1619; GFX7-NEXT:    v_mov_b32_e32 v3, s19
1620; GFX7-NEXT:    s_bfe_i32 s13, s4, 0x40000
1621; GFX7-NEXT:    v_mov_b32_e32 v2, s20
1622; GFX7-NEXT:    s_bfe_i32 s4, s4, 0x40004
1623; GFX7-NEXT:    v_mov_b32_e32 v1, s5
1624; GFX7-NEXT:    v_mul_i32_i24_e32 v1, s4, v1
1625; GFX7-NEXT:    v_mul_i32_i24_e32 v2, s13, v2
1626; GFX7-NEXT:    v_mul_i32_i24_e32 v3, s12, v3
1627; GFX7-NEXT:    v_mul_i32_i24_e32 v4, s11, v4
1628; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1629; GFX7-NEXT:    v_and_b32_e32 v2, s8, v2
1630; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1631; GFX7-NEXT:    v_and_b32_e32 v4, s8, v4
1632; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
1633; GFX7-NEXT:    v_or_b32_e32 v2, v2, v1
1634; GFX7-NEXT:    v_alignbit_b32 v1, v3, v1, 16
1635; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
1636; GFX7-NEXT:    v_mov_b32_e32 v5, s17
1637; GFX7-NEXT:    v_mov_b32_e32 v6, s16
1638; GFX7-NEXT:    v_mov_b32_e32 v7, s15
1639; GFX7-NEXT:    s_waitcnt vmcnt(0)
1640; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1641; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
1642; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
1643; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
1644; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v5, v0
1645; GFX7-NEXT:    v_mad_i32_i24 v0, s9, v6, v0
1646; GFX7-NEXT:    v_mad_i32_i24 v0, s7, v7, v0
1647; GFX7-NEXT:    v_mov_b32_e32 v1, s14
1648; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
1649; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
1650; GFX7-NEXT:    s_endpgm
1651;
1652; GFX8-LABEL: idot8_acc16_vecMul:
1653; GFX8:       ; %bb.0: ; %entry
1654; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1655; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1656; GFX8-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
1657; GFX8-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
1658; GFX8-NEXT:    s_mov_b32 s18, -1
1659; GFX8-NEXT:    s_mov_b32 s19, 0xe80000
1660; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1661; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1662; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1663; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
1664; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1665; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
1666; GFX8-NEXT:    s_add_u32 s16, s16, s3
1667; GFX8-NEXT:    s_addc_u32 s17, s17, 0
1668; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1669; GFX8-NEXT:    s_bfe_i32 s8, s0, 0x40000
1670; GFX8-NEXT:    s_bfe_i32 s15, s1, 0x40000
1671; GFX8-NEXT:    s_bfe_i32 s10, s1, 0x40018
1672; GFX8-NEXT:    s_bfe_i32 s11, s1, 0x40014
1673; GFX8-NEXT:    s_bfe_i32 s12, s1, 0x40010
1674; GFX8-NEXT:    s_bfe_i32 s13, s1, 0x4000c
1675; GFX8-NEXT:    s_bfe_i32 s14, s1, 0x40004
1676; GFX8-NEXT:    s_ashr_i32 s9, s1, 28
1677; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x40008
1678; GFX8-NEXT:    v_mov_b32_e32 v4, s15
1679; GFX8-NEXT:    s_ashr_i32 s2, s0, 28
1680; GFX8-NEXT:    s_bfe_i32 s3, s0, 0x40018
1681; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x40014
1682; GFX8-NEXT:    s_bfe_i32 s5, s0, 0x40010
1683; GFX8-NEXT:    s_bfe_i32 s6, s0, 0x4000c
1684; GFX8-NEXT:    s_bfe_i32 s7, s0, 0x40004
1685; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x40008
1686; GFX8-NEXT:    v_mov_b32_e32 v3, s1
1687; GFX8-NEXT:    v_mov_b32_e32 v5, s14
1688; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s0, v3
1689; GFX8-NEXT:    v_mov_b32_e32 v6, s13
1690; GFX8-NEXT:    v_mov_b32_e32 v7, s12
1691; GFX8-NEXT:    v_mov_b32_e32 v8, s11
1692; GFX8-NEXT:    v_mov_b32_e32 v9, s10
1693; GFX8-NEXT:    s_waitcnt vmcnt(0)
1694; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v4, v2
1695; GFX8-NEXT:    v_mad_i32_i24 v2, s7, v5, v2
1696; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1697; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v6, v2
1698; GFX8-NEXT:    v_mad_i32_i24 v2, s5, v7, v2
1699; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v8, v2
1700; GFX8-NEXT:    v_mad_i32_i24 v2, s3, v9, v2
1701; GFX8-NEXT:    v_mov_b32_e32 v3, s9
1702; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
1703; GFX8-NEXT:    flat_store_short v[0:1], v2
1704; GFX8-NEXT:    s_endpgm
1705;
1706; GFX9-LABEL: idot8_acc16_vecMul:
1707; GFX9:       ; %bb.0: ; %entry
1708; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1709; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1710; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1711; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1712; GFX9-NEXT:    s_mov_b32 s22, -1
1713; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1714; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
1715; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
1716; GFX9-NEXT:    s_add_u32 s20, s20, s3
1717; GFX9-NEXT:    s_addc_u32 s21, s21, 0
1718; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
1719; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1720; GFX9-NEXT:    s_bfe_u32 s3, s2, 0x40018
1721; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
1722; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40010
1723; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
1724; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
1725; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
1726; GFX9-NEXT:    s_and_b32 s11, s2, 15
1727; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40004
1728; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
1729; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
1730; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s9, s10
1731; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
1732; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s5, s8
1733; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
1734; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
1735; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x40018
1736; GFX9-NEXT:    s_lshr_b32 s12, s6, 28
1737; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40010
1738; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
1739; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40008
1740; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
1741; GFX9-NEXT:    s_and_b32 s17, s6, 15
1742; GFX9-NEXT:    s_bfe_u32 s6, s6, 0x40004
1743; GFX9-NEXT:    v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
1744; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s17, s6
1745; GFX9-NEXT:    v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
1746; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1747; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
1748; GFX9-NEXT:    v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
1749; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
1750; GFX9-NEXT:    global_load_ushort v5, v0, s[0:1]
1751; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s15, s16
1752; GFX9-NEXT:    v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1]
1753; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s13, s14
1754; GFX9-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
1755; GFX9-NEXT:    v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
1756; GFX9-NEXT:    v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
1757; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
1758; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s7, s12
1759; GFX9-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
1760; GFX9-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
1761; GFX9-NEXT:    v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1]
1762; GFX9-NEXT:    v_pk_mul_lo_u16 v3, v3, v7
1763; GFX9-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
1764; GFX9-NEXT:    v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
1765; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v8
1766; GFX9-NEXT:    s_waitcnt vmcnt(0)
1767; GFX9-NEXT:    v_add_u32_e32 v5, v1, v5
1768; GFX9-NEXT:    v_add_u32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1769; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1770; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1771; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1772; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1773; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
1774; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1775; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1776; GFX9-NEXT:    s_endpgm
1777;
1778; GFX9-DL-LABEL: idot8_acc16_vecMul:
1779; GFX9-DL:       ; %bb.0: ; %entry
1780; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1781; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1782; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
1783; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
1784; GFX9-DL-NEXT:    s_mov_b32 s22, -1
1785; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1786; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
1787; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
1788; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
1789; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
1790; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
1791; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1792; GFX9-DL-NEXT:    s_bfe_u32 s3, s2, 0x40018
1793; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
1794; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40010
1795; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
1796; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
1797; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x4000c
1798; GFX9-DL-NEXT:    s_and_b32 s11, s2, 15
1799; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40004
1800; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
1801; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
1802; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s9, s10
1803; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
1804; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s5, s8
1805; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
1806; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
1807; GFX9-DL-NEXT:    s_bfe_u32 s7, s6, 0x40018
1808; GFX9-DL-NEXT:    s_lshr_b32 s12, s6, 28
1809; GFX9-DL-NEXT:    s_bfe_u32 s13, s6, 0x40010
1810; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40014
1811; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x40008
1812; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x4000c
1813; GFX9-DL-NEXT:    s_and_b32 s17, s6, 15
1814; GFX9-DL-NEXT:    s_bfe_u32 s6, s6, 0x40004
1815; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
1816; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s17, s6
1817; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
1818; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1819; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
1820; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
1821; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
1822; GFX9-DL-NEXT:    global_load_ushort v5, v0, s[0:1]
1823; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s15, s16
1824; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1]
1825; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s13, s14
1826; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
1827; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
1828; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
1829; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
1830; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s7, s12
1831; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
1832; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
1833; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1]
1834; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v7
1835; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
1836; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
1837; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v8
1838; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1839; GFX9-DL-NEXT:    v_add_u32_e32 v5, v1, v5
1840; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1841; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1842; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1843; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v3
1844; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1845; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v4
1846; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1847; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
1848; GFX9-DL-NEXT:    s_endpgm
1849;
1850; GFX10-DL-LABEL: idot8_acc16_vecMul:
1851; GFX10-DL:       ; %bb.0: ; %entry
1852; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1853; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1854; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1855; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1856; GFX10-DL-NEXT:    s_mov_b32 s14, -1
1857; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
1858; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
1859; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1860; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
1861; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1862; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
1863; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1864; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1865; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1866; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
1867; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 28
1868; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
1869; GFX10-DL-NEXT:    s_bfe_u32 s7, s0, 0x40014
1870; GFX10-DL-NEXT:    s_bfe_u32 s8, s0, 0x40008
1871; GFX10-DL-NEXT:    s_bfe_u32 s9, s0, 0x4000c
1872; GFX10-DL-NEXT:    s_and_b32 s10, s0, 15
1873; GFX10-DL-NEXT:    s_bfe_u32 s0, s0, 0x40004
1874; GFX10-DL-NEXT:    s_and_b32 s11, s1, 15
1875; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s10, s0
1876; GFX10-DL-NEXT:    s_bfe_u32 s10, s1, 0x40004
1877; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1]
1878; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s11, s10
1879; GFX10-DL-NEXT:    s_bfe_u32 s11, s1, 0x4000c
1880; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
1881; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40008
1882; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
1883; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
1884; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s11
1885; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
1886; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1]
1887; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
1888; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40010
1889; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40014
1890; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v3
1891; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1]
1892; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
1893; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
1894; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s8, s0
1895; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1]
1896; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
1897; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
1898; GFX10-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
1899; GFX10-DL-NEXT:    s_lshr_b32 s0, s1, 28
1900; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s2, s3
1901; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
1902; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s10, s0
1903; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
1904; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1905; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
1906; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1907; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1]
1908; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s1 op_sel_hi:[0,1]
1909; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1910; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
1911; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
1912; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1913; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1]
1914; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1915; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
1916; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1917; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v3
1918; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1919; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
1920; GFX10-DL-NEXT:    s_endpgm
1921                                              <8 x i4> addrspace(1)* %src2,
1922                                              i16 addrspace(1)* nocapture %dst) {
1923entry:
  ; Load the two packed 4-bit operand vectors.
1924  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
1925  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
1926
  ; Widen every lane with sign extension so the multiply below is signed.
1927  %cvec1 = sext <8 x i4> %vec1 to <8 x i16>
1928  %cvec2 = sext <8 x i4> %vec2 to <8 x i16>
1929
  ; One vector multiply, then split the product into its eight scalar lanes.
1930  %mul = mul <8 x i16> %cvec1, %cvec2
1931  %mul0 = extractelement <8 x i16> %mul, i64 0
1932  %mul1 = extractelement <8 x i16> %mul, i64 1
1933  %mul2 = extractelement <8 x i16> %mul, i64 2
1934  %mul3 = extractelement <8 x i16> %mul, i64 3
1935  %mul4 = extractelement <8 x i16> %mul, i64 4
1936  %mul5 = extractelement <8 x i16> %mul, i64 5
1937  %mul6 = extractelement <8 x i16> %mul, i64 6
1938  %mul7 = extractelement <8 x i16> %mul, i64 7
1939
  ; Start from the accumulator currently stored at %dst and add lanes 0..7 in order.
1940  %acc = load i16, i16 addrspace(1)* %dst, align 4
1941  %add1 = add i16 %mul0, %acc
1942  %add2 = add i16 %add1, %mul1
1943  %add3 = add i16 %add2, %mul2
1944  %add4 = add i16 %add3, %mul3
1945  %add5 = add i16 %add4, %mul4
1946  %add6 = add i16 %add5, %mul5
1947  %add7 = add i16 %add6, %mul6
1948  %add8 = add i16 %add7, %mul7
1949
  ; Write the updated accumulator back to the same location.
1950  store i16 %add8, i16 addrspace(1)* %dst, align 4
1951  ret void
1952}
1953
1954; TODO: Support this pattern.
1955define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
1956; GFX7-LABEL: idot8_acc8_vecMul:
1957; GFX7:       ; %bb.0: ; %entry
1958; GFX7-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
1959; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1960; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1961; GFX7-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
1962; GFX7-NEXT:    s_mov_b32 s26, -1
1963; GFX7-NEXT:    s_mov_b32 s27, 0xe8f000
1964; GFX7-NEXT:    s_add_u32 s24, s24, s3
1965; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1966; GFX7-NEXT:    s_mov_b32 s2, -1
1967; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1968; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
1969; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1970; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1971; GFX7-NEXT:    s_addc_u32 s25, s25, 0
1972; GFX7-NEXT:    s_movk_i32 s8, 0xff
1973; GFX7-NEXT:    s_mov_b32 s9, 0xffff
1974; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1975; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
1976; GFX7-NEXT:    s_bfe_i32 s15, s5, 0x40000
1977; GFX7-NEXT:    s_bfe_i32 s16, s5, 0x40004
1978; GFX7-NEXT:    s_bfe_i32 s17, s5, 0x40008
1979; GFX7-NEXT:    s_bfe_i32 s18, s5, 0x4000c
1980; GFX7-NEXT:    s_bfe_i32 s19, s5, 0x40010
1981; GFX7-NEXT:    s_bfe_i32 s20, s5, 0x40014
1982; GFX7-NEXT:    s_bfe_i32 s21, s5, 0x40018
1983; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
1984; GFX7-NEXT:    v_mov_b32_e32 v8, s15
1985; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x40004
1986; GFX7-NEXT:    v_mov_b32_e32 v7, s16
1987; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x40008
1988; GFX7-NEXT:    v_mov_b32_e32 v6, s17
1989; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x4000c
1990; GFX7-NEXT:    v_mov_b32_e32 v5, s18
1991; GFX7-NEXT:    s_bfe_i32 s12, s4, 0x40010
1992; GFX7-NEXT:    v_mov_b32_e32 v4, s19
1993; GFX7-NEXT:    s_bfe_i32 s13, s4, 0x40014
1994; GFX7-NEXT:    v_mov_b32_e32 v3, s20
1995; GFX7-NEXT:    s_bfe_i32 s14, s4, 0x40018
1996; GFX7-NEXT:    v_mov_b32_e32 v2, s21
1997; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
1998; GFX7-NEXT:    v_mov_b32_e32 v1, s5
1999; GFX7-NEXT:    v_mul_i32_i24_e32 v1, s4, v1
2000; GFX7-NEXT:    v_mul_i32_i24_e32 v2, s14, v2
2001; GFX7-NEXT:    v_mul_i32_i24_e32 v3, s13, v3
2002; GFX7-NEXT:    v_mul_i32_i24_e32 v9, s12, v4
2003; GFX7-NEXT:    v_mul_i32_i24_e32 v5, s11, v5
2004; GFX7-NEXT:    v_mul_i32_i24_e32 v6, s10, v6
2005; GFX7-NEXT:    v_mul_i32_i24_e32 v7, s7, v7
2006; GFX7-NEXT:    v_mul_i32_i24_e32 v8, s6, v8
2007; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
2008; GFX7-NEXT:    v_and_b32_e32 v2, s8, v2
2009; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
2010; GFX7-NEXT:    v_and_b32_e32 v9, s8, v9
2011; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
2012; GFX7-NEXT:    v_and_b32_e32 v6, s8, v6
2013; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
2014; GFX7-NEXT:    v_and_b32_e32 v8, s8, v8
2015; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
2016; GFX7-NEXT:    v_or_b32_e32 v2, v9, v3
2017; GFX7-NEXT:    v_or_b32_e32 v3, v6, v5
2018; GFX7-NEXT:    v_or_b32_e32 v5, v8, v7
2019; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2020; GFX7-NEXT:    v_and_b32_e32 v2, s9, v2
2021; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2022; GFX7-NEXT:    v_and_b32_e32 v5, s9, v5
2023; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
2024; GFX7-NEXT:    v_or_b32_e32 v2, v5, v3
2025; GFX7-NEXT:    v_alignbit_b32 v3, v1, v2, 8
2026; GFX7-NEXT:    v_alignbit_b32 v5, v1, v2, 16
2027; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
2028; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
2029; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
2030; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
2031; GFX7-NEXT:    s_waitcnt vmcnt(0)
2032; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2033; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
2034; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
2035; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
2036; GFX7-NEXT:    v_mad_i32_i24 v0, s12, v4, v0
2037; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
2038; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
2039; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2040; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2041; GFX7-NEXT:    s_endpgm
2042;
2043; GFX8-LABEL: idot8_acc8_vecMul:
2044; GFX8:       ; %bb.0: ; %entry
2045; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2046; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2047; GFX8-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
2048; GFX8-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
2049; GFX8-NEXT:    s_mov_b32 s22, -1
2050; GFX8-NEXT:    s_mov_b32 s23, 0xe80000
2051; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2052; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2053; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2054; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
2055; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
2056; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
2057; GFX8-NEXT:    s_add_u32 s20, s20, s3
2058; GFX8-NEXT:    s_addc_u32 s21, s21, 0
2059; GFX8-NEXT:    s_mov_b32 s0, 0xffff
2060; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2061; GFX8-NEXT:    s_bfe_i32 s7, s1, 0x40004
2062; GFX8-NEXT:    s_bfe_i32 s9, s1, 0x4000c
2063; GFX8-NEXT:    s_bfe_i32 s14, s2, 0x40004
2064; GFX8-NEXT:    s_bfe_i32 s15, s2, 0x40000
2065; GFX8-NEXT:    s_bfe_i32 s16, s2, 0x4000c
2066; GFX8-NEXT:    s_bfe_i32 s3, s1, 0x40014
2067; GFX8-NEXT:    s_ashr_i32 s5, s1, 28
2068; GFX8-NEXT:    s_bfe_i32 s10, s2, 0x40014
2069; GFX8-NEXT:    s_bfe_i32 s11, s2, 0x40010
2070; GFX8-NEXT:    s_ashr_i32 s12, s2, 28
2071; GFX8-NEXT:    s_bfe_i32 s13, s2, 0x40018
2072; GFX8-NEXT:    s_bfe_i32 s2, s2, 0x40008
2073; GFX8-NEXT:    s_bfe_i32 s8, s1, 0x40000
2074; GFX8-NEXT:    v_mov_b32_e32 v4, s16
2075; GFX8-NEXT:    v_mov_b32_e32 v5, s9
2076; GFX8-NEXT:    v_mov_b32_e32 v6, s15
2077; GFX8-NEXT:    v_mov_b32_e32 v7, s14
2078; GFX8-NEXT:    v_mov_b32_e32 v8, s7
2079; GFX8-NEXT:    v_mul_i32_i24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2080; GFX8-NEXT:    v_mul_i32_i24_e32 v5, s8, v6
2081; GFX8-NEXT:    v_mul_i32_i24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2082; GFX8-NEXT:    s_bfe_i32 s4, s1, 0x40010
2083; GFX8-NEXT:    s_bfe_i32 s6, s1, 0x40018
2084; GFX8-NEXT:    v_mov_b32_e32 v9, s13
2085; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x40008
2086; GFX8-NEXT:    v_mov_b32_e32 v3, s2
2087; GFX8-NEXT:    v_mov_b32_e32 v10, s12
2088; GFX8-NEXT:    v_mov_b32_e32 v11, s5
2089; GFX8-NEXT:    v_mov_b32_e32 v12, s11
2090; GFX8-NEXT:    v_mov_b32_e32 v13, s10
2091; GFX8-NEXT:    v_mov_b32_e32 v14, s3
2092; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s1, v3
2093; GFX8-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2094; GFX8-NEXT:    v_mul_i32_i24_e32 v7, s6, v9
2095; GFX8-NEXT:    v_mul_i32_i24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2096; GFX8-NEXT:    v_mul_i32_i24_e32 v9, s4, v12
2097; GFX8-NEXT:    v_mul_i32_i24_sdwa v10, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2098; GFX8-NEXT:    v_and_b32_e32 v5, s0, v5
2099; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2100; GFX8-NEXT:    v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2101; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2102; GFX8-NEXT:    v_and_b32_e32 v4, s0, v9
2103; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
2104; GFX8-NEXT:    v_or_b32_e32 v6, v4, v7
2105; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
2106; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v6
2107; GFX8-NEXT:    s_waitcnt vmcnt(0)
2108; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
2109; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
2110; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
2111; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2112; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2113; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
2114; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2115; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2116; GFX8-NEXT:    flat_store_byte v[0:1], v2
2117; GFX8-NEXT:    s_endpgm
2118;
2119; GFX9-LABEL: idot8_acc8_vecMul:
2120; GFX9:       ; %bb.0: ; %entry
2121; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2122; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2123; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2124; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
2125; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
2126; GFX9-NEXT:    s_mov_b32 s22, -1
2127; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2128; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
2129; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
2130; GFX9-NEXT:    s_add_u32 s20, s20, s3
2131; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
2132; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
2133; GFX9-NEXT:    s_addc_u32 s21, s21, 0
2134; GFX9-NEXT:    s_mov_b32 s2, 0xffff
2135; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2136; GFX9-NEXT:    s_lshr_b32 s9, s3, 4
2137; GFX9-NEXT:    s_lshr_b32 s16, s4, 4
2138; GFX9-NEXT:    v_lshlrev_b16_e64 v2, 12, s3
2139; GFX9-NEXT:    v_lshlrev_b16_e64 v3, 12, s4
2140; GFX9-NEXT:    v_lshlrev_b16_e64 v6, 12, s9
2141; GFX9-NEXT:    v_lshlrev_b16_e64 v13, 12, s16
2142; GFX9-NEXT:    s_lshr_b32 s10, s3, 12
2143; GFX9-NEXT:    s_lshr_b32 s11, s3, 8
2144; GFX9-NEXT:    s_lshr_b32 s17, s4, 12
2145; GFX9-NEXT:    s_lshr_b32 s18, s4, 8
2146; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s11
2147; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s10
2148; GFX9-NEXT:    v_lshlrev_b16_e64 v11, 12, s18
2149; GFX9-NEXT:    v_lshlrev_b16_e64 v12, 12, s17
2150; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2151; GFX9-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
2152; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2153; GFX9-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
2154; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
2155; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
2156; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
2157; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
2158; GFX9-NEXT:    v_mul_lo_u16_e32 v2, v2, v3
2159; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2160; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2161; GFX9-NEXT:    s_lshr_b32 s5, s3, 20
2162; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
2163; GFX9-NEXT:    s_lshr_b32 s12, s4, 20
2164; GFX9-NEXT:    s_lshr_b32 s13, s4, 16
2165; GFX9-NEXT:    v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2166; GFX9-NEXT:    v_mul_lo_u16_e32 v4, v4, v11
2167; GFX9-NEXT:    v_lshlrev_b16_e64 v9, 12, s6
2168; GFX9-NEXT:    v_lshlrev_b16_e64 v10, 12, s5
2169; GFX9-NEXT:    v_lshlrev_b16_e64 v16, 12, s13
2170; GFX9-NEXT:    v_lshlrev_b16_e64 v17, 12, s12
2171; GFX9-NEXT:    s_lshr_b32 s7, s3, 28
2172; GFX9-NEXT:    s_lshr_b32 s8, s3, 24
2173; GFX9-NEXT:    s_lshr_b32 s14, s4, 28
2174; GFX9-NEXT:    s_lshr_b32 s15, s4, 24
2175; GFX9-NEXT:    v_and_b32_e32 v2, s2, v2
2176; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2177; GFX9-NEXT:    v_lshlrev_b16_e64 v7, 12, s8
2178; GFX9-NEXT:    v_lshlrev_b16_e64 v8, 12, s7
2179; GFX9-NEXT:    v_lshlrev_b16_e64 v14, 12, s15
2180; GFX9-NEXT:    v_lshlrev_b16_e64 v15, 12, s14
2181; GFX9-NEXT:    v_or_b32_e32 v4, v2, v4
2182; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
2183; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
2184; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
2185; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
2186; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2187; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
2188; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2189; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
2190; GFX9-NEXT:    v_mul_lo_u16_sdwa v3, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2191; GFX9-NEXT:    v_mul_lo_u16_e32 v9, v9, v16
2192; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v4
2193; GFX9-NEXT:    v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2194; GFX9-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2195; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
2196; GFX9-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2197; GFX9-NEXT:    v_and_b32_e32 v3, s2, v3
2198; GFX9-NEXT:    v_or_b32_e32 v5, v3, v7
2199; GFX9-NEXT:    s_waitcnt vmcnt(0)
2200; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
2201; GFX9-NEXT:    v_add_u32_e32 v1, v1, v6
2202; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
2203; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2204; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
2205; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v5
2206; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
2207; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2208; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2209; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
2210; GFX9-NEXT:    s_endpgm
2211;
2212; GFX9-DL-LABEL: idot8_acc8_vecMul:
2213; GFX9-DL:       ; %bb.0: ; %entry
2214; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2215; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2216; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2217; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
2218; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
2219; GFX9-DL-NEXT:    s_mov_b32 s22, -1
2220; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2221; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
2222; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
2223; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
2224; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
2225; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
2226; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
2227; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
2228; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2229; GFX9-DL-NEXT:    s_lshr_b32 s9, s3, 4
2230; GFX9-DL-NEXT:    s_lshr_b32 s16, s4, 4
2231; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s3
2232; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s4
2233; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s9
2234; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s16
2235; GFX9-DL-NEXT:    s_lshr_b32 s10, s3, 12
2236; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 8
2237; GFX9-DL-NEXT:    s_lshr_b32 s17, s4, 12
2238; GFX9-DL-NEXT:    s_lshr_b32 s18, s4, 8
2239; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s11
2240; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s10
2241; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s18
2242; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s17
2243; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2244; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
2245; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2246; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
2247; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
2248; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
2249; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
2250; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
2251; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v2, v2, v3
2252; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2253; GFX9-DL-NEXT:    v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2254; GFX9-DL-NEXT:    s_lshr_b32 s5, s3, 20
2255; GFX9-DL-NEXT:    s_lshr_b32 s6, s3, 16
2256; GFX9-DL-NEXT:    s_lshr_b32 s12, s4, 20
2257; GFX9-DL-NEXT:    s_lshr_b32 s13, s4, 16
2258; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2259; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v4, v4, v11
2260; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s6
2261; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s5
2262; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v16, 12, s13
2263; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v17, 12, s12
2264; GFX9-DL-NEXT:    s_lshr_b32 s7, s3, 28
2265; GFX9-DL-NEXT:    s_lshr_b32 s8, s3, 24
2266; GFX9-DL-NEXT:    s_lshr_b32 s14, s4, 28
2267; GFX9-DL-NEXT:    s_lshr_b32 s15, s4, 24
2268; GFX9-DL-NEXT:    v_and_b32_e32 v2, s2, v2
2269; GFX9-DL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2270; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s8
2271; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s7
2272; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s15
2273; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s14
2274; GFX9-DL-NEXT:    v_or_b32_e32 v4, v2, v4
2275; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
2276; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
2277; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
2278; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
2279; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2280; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
2281; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2282; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
2283; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v3, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2284; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, v9, v16
2285; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v4
2286; GFX9-DL-NEXT:    v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2287; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2288; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
2289; GFX9-DL-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2290; GFX9-DL-NEXT:    v_and_b32_e32 v3, s2, v3
2291; GFX9-DL-NEXT:    v_or_b32_e32 v5, v3, v7
2292; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2293; GFX9-DL-NEXT:    v_add_u32_e32 v1, v2, v1
2294; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v6
2295; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
2296; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2297; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v3
2298; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v5
2299; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
2300; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2301; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2302; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
2303; GFX9-DL-NEXT:    s_endpgm
2304;
2305; GFX10-DL-LABEL: idot8_acc8_vecMul:
2306; GFX10-DL:       ; %bb.0: ; %entry
2307; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2308; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2309; GFX10-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
2310; GFX10-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
2311; GFX10-DL-NEXT:    s_mov_b32 s22, -1
2312; GFX10-DL-NEXT:    s_mov_b32 s23, 0x31c16000
2313; GFX10-DL-NEXT:    s_add_u32 s20, s20, s3
2314; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2315; GFX10-DL-NEXT:    s_addc_u32 s21, s21, 0
2316; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2317; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
2318; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2319; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2320; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
2321; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2322; GFX10-DL-NEXT:    s_lshr_b32 s9, s0, 4
2323; GFX10-DL-NEXT:    s_lshr_b32 s16, s1, 4
2324; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s9
2325; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s16
2326; GFX10-DL-NEXT:    s_lshr_b32 s10, s0, 12
2327; GFX10-DL-NEXT:    s_lshr_b32 s17, s1, 12
2328; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s0
2329; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s1
2330; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s17
2331; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s10
2332; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 12, v6
2333; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v12
2334; GFX10-DL-NEXT:    s_lshr_b32 s11, s0, 8
2335; GFX10-DL-NEXT:    s_lshr_b32 s18, s1, 8
2336; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s11
2337; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s18
2338; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v2, 12, v2
2339; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
2340; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v19, 12, v5
2341; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, v6, v12
2342; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v13, 12, v13
2343; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
2344; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v11
2345; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v2, v2, v3
2346; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v6
2347; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, v19, v13
2348; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 20
2349; GFX10-DL-NEXT:    s_lshr_b32 s6, s0, 16
2350; GFX10-DL-NEXT:    s_lshr_b32 s7, s0, 28
2351; GFX10-DL-NEXT:    s_lshr_b32 s8, s0, 24
2352; GFX10-DL-NEXT:    s_lshr_b32 s12, s1, 20
2353; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2354; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s8
2355; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s7
2356; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s6
2357; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s3
2358; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s12
2359; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v4, v11
2360; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v3
2361; GFX10-DL-NEXT:    s_lshr_b32 s13, s1, 16
2362; GFX10-DL-NEXT:    s_lshr_b32 s14, s1, 28
2363; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s13
2364; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v7
2365; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 12, v8
2366; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v8, 12, v9
2367; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2368; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
2369; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s14
2370; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v10
2371; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v9, 12, v12
2372; GFX10-DL-NEXT:    s_lshr_b32 s15, s1, 24
2373; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 12, v6
2374; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v10, 12, v15
2375; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s15
2376; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v3
2377; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v4, v9
2378; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v15, v8, v6
2379; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, v7, v10
2380; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v14
2381; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
2382; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2383; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
2384; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 8, v4
2385; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v5, v11
2386; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v7
2387; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v8
2388; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2389; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2390; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
2391; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
2392; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2393; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v4
2394; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2395; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
2396; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2397; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2398; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2399; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
2400; GFX10-DL-NEXT:    s_endpgm
2401                                             <8 x i4> addrspace(1)* %src2,
2402                                             i8 addrspace(1)* nocapture %dst) {
2403entry:
2404  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
2405  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
2406
2407  %cvec1 = sext <8 x i4> %vec1 to <8 x i8>
2408  %cvec2 = sext <8 x i4> %vec2 to <8 x i8>
2409
2410  %mul = mul <8 x i8> %cvec1, %cvec2
2411  %mul0 = extractelement <8 x i8> %mul, i64 0
2412  %mul1 = extractelement <8 x i8> %mul, i64 1
2413  %mul2 = extractelement <8 x i8> %mul, i64 2
2414  %mul3 = extractelement <8 x i8> %mul, i64 3
2415  %mul4 = extractelement <8 x i8> %mul, i64 4
2416  %mul5 = extractelement <8 x i8> %mul, i64 5
2417  %mul6 = extractelement <8 x i8> %mul, i64 6
2418  %mul7 = extractelement <8 x i8> %mul, i64 7
2419
2420  %acc = load i8, i8 addrspace(1)* %dst, align 4
2421  %add1 = add i8 %mul0, %acc
2422  %add2 = add i8 %add1, %mul1
2423  %add3 = add i8 %add2, %mul2
2424  %add4 = add i8 %add3, %mul3
2425  %add5 = add i8 %add4, %mul4
2426  %add6 = add i8 %add5, %mul5
2427  %add7 = add i8 %add6, %mul6
2428  %add8 = add i8 %add7, %mul7
2429
2430  store i8 %add8, i8 addrspace(1)* %dst, align 4
2431  ret void
2432}
2433