1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
8
; idot4_acc32 -- signed 4 x i8 dot product accumulated into an i32.
;
; Loads one <4 x i8> from each of %src1 and %src2, sign-extends every element
; to i32, multiplies the elements pairwise, adds the four products to the i32
; loaded from %dst and stores the sum back to %dst.
;
; The assertions below expect a single v_dot4_i32_i8 under the GFX9-DL and
; GFX10-DL prefixes, and a chain of four v_mad_i32_i24 under the GFX7, GFX8
; and GFX9-NODL prefixes.
;
; NOTE(review): the multiplies carry both nuw and nsw although their operands
; are sign-extended -- confirm that the nuw flag is intended.
9define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
10; GFX7-LABEL: idot4_acc32:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
13; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
14; GFX7-NEXT:    s_mov_b32 s3, 0xf000
15; GFX7-NEXT:    s_mov_b32 s2, -1
16; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
18; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
19; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
20; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21; GFX7-NEXT:    s_sext_i32_i8 s6, s4
22; GFX7-NEXT:    s_sext_i32_i8 s7, s5
23; GFX7-NEXT:    s_bfe_i32 s9, s5, 0x80008
24; GFX7-NEXT:    v_mov_b32_e32 v0, s7
25; GFX7-NEXT:    v_mov_b32_e32 v1, s12
26; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80010
27; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v0, v1
28; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80008
29; GFX7-NEXT:    v_mov_b32_e32 v1, s9
30; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x80010
31; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
32; GFX7-NEXT:    v_mov_b32_e32 v1, s11
33; GFX7-NEXT:    s_ashr_i32 s5, s5, 24
34; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
35; GFX7-NEXT:    s_ashr_i32 s4, s4, 24
36; GFX7-NEXT:    v_mov_b32_e32 v1, s5
37; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v1, v0
38; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
39; GFX7-NEXT:    s_endpgm
40;
41; GFX8-LABEL: idot4_acc32:
42; GFX8:       ; %bb.0: ; %entry
43; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
44; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
45; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
47; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
48; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
49; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX8-NEXT:    s_sext_i32_i8 s4, s2
51; GFX8-NEXT:    s_sext_i32_i8 s5, s3
52; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x80008
53; GFX8-NEXT:    v_mov_b32_e32 v0, s5
54; GFX8-NEXT:    v_mov_b32_e32 v1, s10
55; GFX8-NEXT:    s_bfe_i32 s9, s3, 0x80010
56; GFX8-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
57; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x80008
58; GFX8-NEXT:    v_mov_b32_e32 v1, s7
59; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x80010
60; GFX8-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
61; GFX8-NEXT:    v_mov_b32_e32 v1, s9
62; GFX8-NEXT:    s_ashr_i32 s3, s3, 24
63; GFX8-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
64; GFX8-NEXT:    s_ashr_i32 s2, s2, 24
65; GFX8-NEXT:    v_mov_b32_e32 v1, s3
66; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
67; GFX8-NEXT:    v_mov_b32_e32 v0, s0
68; GFX8-NEXT:    v_mov_b32_e32 v1, s1
69; GFX8-NEXT:    flat_store_dword v[0:1], v2
70; GFX8-NEXT:    s_endpgm
71;
72; GFX9-NODL-LABEL: idot4_acc32:
73; GFX9-NODL:       ; %bb.0: ; %entry
74; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
75; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
76; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
77; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
78; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
79; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
80; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
81; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
82; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
83; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
84; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80008
85; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
86; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
87; GFX9-NODL-NEXT:    s_bfe_i32 s9, s3, 0x80010
88; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
89; GFX9-NODL-NEXT:    s_bfe_i32 s6, s2, 0x80008
90; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
91; GFX9-NODL-NEXT:    s_bfe_i32 s8, s2, 0x80010
92; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
93; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
94; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 24
95; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
96; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
97; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
98; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
99; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
100; GFX9-NODL-NEXT:    s_endpgm
101;
102; GFX9-DL-LABEL: idot4_acc32:
103; GFX9-DL:       ; %bb.0: ; %entry
104; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
105; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
106; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
107; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
108; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
109; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
110; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
111; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
112; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
113; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
114; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, s4, v1, v2
115; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
116; GFX9-DL-NEXT:    s_endpgm
117;
118; GFX10-DL-LABEL: idot4_acc32:
119; GFX10-DL:       ; %bb.0: ; %entry
120; GFX10-DL-NEXT:    s_clause 0x1
121; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
122; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
123; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
124; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
126; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
127; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
128; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
129; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
130; GFX10-DL-NEXT:    v_dot4_i32_i8 v0, s0, s1, v0
131; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
132; GFX10-DL-NEXT:    s_endpgm
133                                       <4 x i8> addrspace(1)* %src2,
134                                       i32 addrspace(1)* nocapture %dst) {
135entry:
  ; Load both packed 4 x i8 operands.
136  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
137  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
138
  ; Element 0: sign-extend both lanes to i32 and multiply.
139  %v1e0 = extractelement <4 x i8> %vec1, i64 0
140  %cv1e0 = sext i8 %v1e0 to i32
141  %v2e0 = extractelement <4 x i8> %vec2, i64 0
142  %cv2e0 = sext i8 %v2e0 to i32
143  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
144
  ; Element 1.
145  %v1e1 = extractelement <4 x i8> %vec1, i64 1
146  %cv1e1 = sext i8 %v1e1 to i32
147  %v2e1 = extractelement <4 x i8> %vec2, i64 1
148  %cv2e1 = sext i8 %v2e1 to i32
149  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
150
  ; Element 2.
151  %v1e2 = extractelement <4 x i8> %vec1, i64 2
152  %cv1e2 = sext i8 %v1e2 to i32
153  %v2e2 = extractelement <4 x i8> %vec2, i64 2
154  %cv2e2 = sext i8 %v2e2 to i32
155  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
156
  ; Element 3.
157  %v1e3 = extractelement <4 x i8> %vec1, i64 3
158  %cv1e3 = sext i8 %v1e3 to i32
159  %v2e3 = extractelement <4 x i8> %vec2, i64 3
160  %cv2e3 = sext i8 %v2e3 to i32
161  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
162
  ; Add the four products onto the i32 accumulator read from %dst, then
  ; write the result back to the same location.
163  %acc = load i32, i32 addrspace(1)* %dst, align 4
164  %add1 = add i32 %mul1, %acc
165  %add2 = add i32 %add1, %mul2
166  %add3 = add i32 %add2, %mul3
167  %add4 = add i32 %add3, %mul4
168  store i32 %add4, i32 addrspace(1)* %dst, align 4
169  ret void
170}
171
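172; TODO: Currently, vector elements {0 and 3} get zero-extended from i16 to i32 when they
173; should be sign-extended directly to i32; this prevents the pattern recognizer from
; recognizing this pattern.
; NOTE(review): the assertions for the DL targets in this function already expect
; v_dot4_i32_i8, so this TODO may be stale -- confirm.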
; idot4_acc16 -- signed 4 x i8 dot product accumulated into an i16.
;
; Loads one <4 x i8> from each of %src1 and %src2, sign-extends every element
; to i16, multiplies the elements pairwise (mul nsw i16), adds the four
; products to the i16 loaded from %dst and stores the i16 sum back to %dst.
;
; The assertions below expect:
;   * a single v_dot4_i32_i8 under the GFX9-DL and GFX10-DL prefixes;
;   * four v_mad_i32_i24 on sign-extended operands under the GFX8 and
;     GFX9-NODL prefixes;
;   * four v_mad_u32_u24 on operands masked with 0xffff under the GFX7 prefix.
174define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
175; GFX7-LABEL: idot4_acc16:
176; GFX7:       ; %bb.0: ; %entry
177; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
178; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
179; GFX7-NEXT:    s_mov_b32 s3, 0xf000
180; GFX7-NEXT:    s_mov_b32 s2, -1
181; GFX7-NEXT:    s_mov_b32 s8, 0xffff
182; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
184; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
185; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
186; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
187; GFX7-NEXT:    s_sext_i32_i8 s6, s4
188; GFX7-NEXT:    s_sext_i32_i8 s7, s5
189; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x80008
190; GFX7-NEXT:    s_and_b32 s7, s7, s8
191; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x80010
192; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x80008
193; GFX7-NEXT:    s_and_b32 s10, s10, s8
194; GFX7-NEXT:    s_and_b32 s6, s6, s8
195; GFX7-NEXT:    v_mov_b32_e32 v1, s7
196; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x80010
197; GFX7-NEXT:    s_ashr_i32 s5, s5, 24
198; GFX7-NEXT:    s_and_b32 s12, s12, s8
199; GFX7-NEXT:    s_and_b32 s9, s9, s8
200; GFX7-NEXT:    v_mov_b32_e32 v2, s10
201; GFX7-NEXT:    s_ashr_i32 s4, s4, 24
202; GFX7-NEXT:    s_and_b32 s11, s11, s8
203; GFX7-NEXT:    s_and_b32 s5, s5, s8
204; GFX7-NEXT:    v_mov_b32_e32 v3, s12
205; GFX7-NEXT:    s_and_b32 s4, s4, s8
206; GFX7-NEXT:    s_waitcnt vmcnt(0)
207; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
208; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
209; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
210; GFX7-NEXT:    v_mov_b32_e32 v1, s5
211; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
212; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
213; GFX7-NEXT:    s_endpgm
214;
215; GFX8-LABEL: idot4_acc16:
216; GFX8:       ; %bb.0: ; %entry
217; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
218; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
219; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
220; GFX8-NEXT:    v_mov_b32_e32 v0, s0
221; GFX8-NEXT:    v_mov_b32_e32 v1, s1
222; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
223; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
224; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
225; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
226; GFX8-NEXT:    s_sext_i32_i8 s2, s0
227; GFX8-NEXT:    s_sext_i32_i8 s3, s1
228; GFX8-NEXT:    s_bfe_i32 s5, s1, 0x80008
229; GFX8-NEXT:    v_mov_b32_e32 v3, s3
230; GFX8-NEXT:    s_bfe_i32 s7, s1, 0x80010
231; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x80008
232; GFX8-NEXT:    v_mov_b32_e32 v4, s5
233; GFX8-NEXT:    s_bfe_i32 s6, s0, 0x80010
234; GFX8-NEXT:    s_ashr_i32 s1, s1, 24
235; GFX8-NEXT:    v_mov_b32_e32 v5, s7
236; GFX8-NEXT:    s_ashr_i32 s0, s0, 24
237; GFX8-NEXT:    s_waitcnt vmcnt(0)
238; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
239; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
240; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v5, v2
241; GFX8-NEXT:    v_mov_b32_e32 v3, s1
242; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
243; GFX8-NEXT:    flat_store_short v[0:1], v2
244; GFX8-NEXT:    s_endpgm
245;
246; GFX9-NODL-LABEL: idot4_acc16:
247; GFX9-NODL:       ; %bb.0: ; %entry
248; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
249; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
250; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
251; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
252; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[0:1]
253; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
254; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
255; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
257; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
258; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80008
259; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
260; GFX9-NODL-NEXT:    s_bfe_i32 s9, s3, 0x80010
261; GFX9-NODL-NEXT:    s_bfe_i32 s6, s2, 0x80008
262; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s7
263; GFX9-NODL-NEXT:    s_bfe_i32 s8, s2, 0x80010
264; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 24
265; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s9
266; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
267; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
268; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v2, v1
269; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v3, v1
270; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s8, v4, v1
271; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
272; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
273; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
274; GFX9-NODL-NEXT:    s_endpgm
275;
276; GFX9-DL-LABEL: idot4_acc16:
277; GFX9-DL:       ; %bb.0: ; %entry
278; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
279; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
280; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
281; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
282; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
283; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
284; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
285; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
286; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
287; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
288; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, s2, v2, v1
289; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
290; GFX9-DL-NEXT:    s_endpgm
291;
292; GFX10-DL-LABEL: idot4_acc16:
293; GFX10-DL:       ; %bb.0: ; %entry
294; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
295; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
296; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
297; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
298; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
299; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
300; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
301; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
302; GFX10-DL-NEXT:    v_dot4_i32_i8 v1, s0, s1, v1
303; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
304; GFX10-DL-NEXT:    s_endpgm
305                                       <4 x i8> addrspace(1)* %src2,
306                                       i16 addrspace(1)* nocapture %dst) {
307entry:
  ; Load both packed 4 x i8 operands.
308  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
309  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
310
  ; Element 0: sign-extend both lanes to i16 and multiply.
311  %v1e0 = extractelement <4 x i8> %vec1, i64 0
312  %cv1e0 = sext i8 %v1e0 to i16
313  %v2e0 = extractelement <4 x i8> %vec2, i64 0
314  %cv2e0 = sext i8 %v2e0 to i16
315  %mul1 = mul nsw i16 %cv1e0, %cv2e0
316
  ; Element 1.
317  %v1e1 = extractelement <4 x i8> %vec1, i64 1
318  %cv1e1 = sext i8 %v1e1 to i16
319  %v2e1 = extractelement <4 x i8> %vec2, i64 1
320  %cv2e1 = sext i8 %v2e1 to i16
321  %mul2 = mul nsw i16 %cv1e1, %cv2e1
322
  ; Element 2.
323  %v1e2 = extractelement <4 x i8> %vec1, i64 2
324  %cv1e2 = sext i8 %v1e2 to i16
325  %v2e2 = extractelement <4 x i8> %vec2, i64 2
326  %cv2e2 = sext i8 %v2e2 to i16
327  %mul3 = mul nsw i16 %cv1e2, %cv2e2
328
  ; Element 3.
329  %v1e3 = extractelement <4 x i8> %vec1, i64 3
330  %cv1e3 = sext i8 %v1e3 to i16
331  %v2e3 = extractelement <4 x i8> %vec2, i64 3
332  %cv2e3 = sext i8 %v2e3 to i16
333  %mul4 = mul nsw i16 %cv1e3, %cv2e3
334
  ; Add the four products onto the i16 accumulator read from %dst, then
  ; write the result back to the same location.
335  %acc = load i16, i16 addrspace(1)* %dst, align 2
336  %add1 = add i16 %mul1, %acc
337  %add2 = add i16 %add1, %mul2
338  %add3 = add i16 %add2, %mul3
339  %add4 = add i16 %add3, %mul4
340  store i16 %add4, i16 addrspace(1)* %dst, align 2
341  ret void
342}
343
; idot4_acc8 -- 4 x i8 dot product computed and accumulated entirely in i8.
;
; Loads one <4 x i8> from each of %src1 and %src2 and multiplies the elements
; pairwise as i8, with no extension and no wrap flags on the multiplies. The
; four products are added to the i8 loaded from %dst (only the final add
; carries nsw) and the i8 sum is stored back to %dst.
;
; The assertions below expect a single v_dot4_u32_u8 under the GFX9-DL and
; GFX10-DL prefixes, and four v_mad_u32_u24 on zero-extended bytes
; (s_and_b32 with 0xff, s_bfe_u32, s_lshr_b32) under the GFX7, GFX8 and
; GFX9-NODL prefixes.
;
; NOTE(review): the i8 load and store of %dst are marked align 2 -- confirm
; that this alignment is intended.
344define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
345; GFX7-LABEL: idot4_acc8:
346; GFX7:       ; %bb.0: ; %entry
347; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
348; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
349; GFX7-NEXT:    s_mov_b32 s3, 0xf000
350; GFX7-NEXT:    s_mov_b32 s2, -1
351; GFX7-NEXT:    s_movk_i32 s8, 0xff
352; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
354; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
355; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
356; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
357; GFX7-NEXT:    s_and_b32 s7, s4, s8
358; GFX7-NEXT:    s_and_b32 s6, s5, s8
359; GFX7-NEXT:    s_bfe_u32 s8, s5, 0x80008
360; GFX7-NEXT:    v_mov_b32_e32 v1, s6
361; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80010
362; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
363; GFX7-NEXT:    v_mov_b32_e32 v2, s8
364; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
365; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
366; GFX7-NEXT:    v_mov_b32_e32 v3, s10
367; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
368; GFX7-NEXT:    s_waitcnt vmcnt(0)
369; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
370; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
371; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
372; GFX7-NEXT:    v_mov_b32_e32 v1, s5
373; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
374; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
375; GFX7-NEXT:    s_endpgm
376;
377; GFX8-LABEL: idot4_acc8:
378; GFX8:       ; %bb.0: ; %entry
379; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
380; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
381; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
382; GFX8-NEXT:    v_mov_b32_e32 v0, s0
383; GFX8-NEXT:    v_mov_b32_e32 v1, s1
384; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
385; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
386; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
387; GFX8-NEXT:    s_movk_i32 s0, 0xff
388; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
389; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80008
390; GFX8-NEXT:    s_and_b32 s3, s2, s0
391; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x80008
392; GFX8-NEXT:    s_and_b32 s0, s1, s0
393; GFX8-NEXT:    v_mov_b32_e32 v3, s3
394; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x80010
395; GFX8-NEXT:    v_mov_b32_e32 v4, s4
396; GFX8-NEXT:    s_bfe_u32 s7, s1, 0x80010
397; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
398; GFX8-NEXT:    v_mov_b32_e32 v5, s6
399; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
400; GFX8-NEXT:    s_waitcnt vmcnt(0)
401; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
402; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
403; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
404; GFX8-NEXT:    v_mov_b32_e32 v3, s2
405; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
406; GFX8-NEXT:    flat_store_byte v[0:1], v2
407; GFX8-NEXT:    s_endpgm
408;
409; GFX9-NODL-LABEL: idot4_acc8:
410; GFX9-NODL:       ; %bb.0: ; %entry
411; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
412; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
413; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
414; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
415; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
416; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
417; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
418; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
419; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80008
421; GFX9-NODL-NEXT:    s_and_b32 s5, s4, s2
422; GFX9-NODL-NEXT:    s_bfe_u32 s6, s4, 0x80008
423; GFX9-NODL-NEXT:    s_and_b32 s2, s3, s2
424; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
425; GFX9-NODL-NEXT:    s_bfe_u32 s8, s4, 0x80010
426; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
427; GFX9-NODL-NEXT:    s_bfe_u32 s9, s3, 0x80010
428; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
429; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
430; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
431; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
432; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
433; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v3, v1
434; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
435; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
436; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
437; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
438; GFX9-NODL-NEXT:    s_endpgm
439;
440; GFX9-DL-LABEL: idot4_acc8:
441; GFX9-DL:       ; %bb.0: ; %entry
442; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
443; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
444; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
445; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
446; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
447; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
448; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
449; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
450; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
451; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
452; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s2, v2, v1
453; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
454; GFX9-DL-NEXT:    s_endpgm
455;
456; GFX10-DL-LABEL: idot4_acc8:
457; GFX10-DL:       ; %bb.0: ; %entry
458; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
459; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
460; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
461; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
462; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
463; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
464; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
465; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
466; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, s0, s1, v1
467; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
468; GFX10-DL-NEXT:    s_endpgm
469                                      <4 x i8> addrspace(1)* %src2,
470                                      i8 addrspace(1)* nocapture %dst) {
471entry:
  ; Load both packed 4 x i8 operands.
472  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
473  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
474
  ; Element 0: multiply the two lanes directly in i8 (no extension).
475  %v1e0 = extractelement <4 x i8> %vec1, i64 0
476  %v2e0 = extractelement <4 x i8> %vec2, i64 0
477  %mul1 = mul i8 %v1e0, %v2e0
478
  ; Element 1.
479  %v1e1 = extractelement <4 x i8> %vec1, i64 1
480  %v2e1 = extractelement <4 x i8> %vec2, i64 1
481  %mul2 = mul i8 %v1e1, %v2e1
482
  ; Element 2.
483  %v1e2 = extractelement <4 x i8> %vec1, i64 2
484  %v2e2 = extractelement <4 x i8> %vec2, i64 2
485  %mul3 = mul i8 %v1e2, %v2e2
486
  ; Element 3.
487  %v1e3 = extractelement <4 x i8> %vec1, i64 3
488  %v2e3 = extractelement <4 x i8> %vec2, i64 3
489  %mul4 = mul i8 %v1e3, %v2e3
490
  ; Add the four products onto the i8 accumulator read from %dst, then write
  ; the result back to the same location. Only the last add is marked nsw.
491  %acc = load i8, i8 addrspace(1)* %dst, align 2
492  %add1 = add i8 %mul1, %acc
493  %add2 = add i8 %add1, %mul2
494  %add3 = add i8 %add2, %mul3
495  %add4 = add nsw i8 %add3, %mul4
496  store i8 %add4, i8 addrspace(1)* %dst, align 2
497  ret void
498}
499
; idot4_multiuse_mul1 -- signed 4 x i8 dot product into an i32 where the
; element-0 product has more than one use.
;
; The four products are formed exactly as in a plain signed i8 dot product
; (sign-extend to i32, multiply), but %mul1 is added into the chain twice
; (in %add and again in %add2). The stored value is therefore
;   acc + mul1 + mul2 + mul1 + mul3 + mul4.
;
; The assertions below expect no dot4 instruction under any prefix: every
; target, including GFX9-DL and GFX10-DL, expects five v_mad_i32_i24, with
; the element-0 multiply-add appearing twice.
;
; NOTE(review): the multiplies carry both nuw and nsw although their operands
; are sign-extended -- confirm that the nuw flag is intended.
500define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
501; GFX7-LABEL: idot4_multiuse_mul1:
502; GFX7:       ; %bb.0: ; %entry
503; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
504; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
505; GFX7-NEXT:    s_mov_b32 s3, 0xf000
506; GFX7-NEXT:    s_mov_b32 s2, -1
507; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
508; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
509; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
510; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
511; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
512; GFX7-NEXT:    s_sext_i32_i8 s6, s4
513; GFX7-NEXT:    s_sext_i32_i8 s7, s5
514; GFX7-NEXT:    s_bfe_i32 s9, s5, 0x80008
515; GFX7-NEXT:    v_mov_b32_e32 v0, s7
516; GFX7-NEXT:    v_mov_b32_e32 v1, s12
517; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80008
518; GFX7-NEXT:    v_mad_i32_i24 v1, s6, v0, v1
519; GFX7-NEXT:    v_mov_b32_e32 v2, s9
520; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80010
521; GFX7-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
522; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x80010
523; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v0, v1
524; GFX7-NEXT:    v_mov_b32_e32 v1, s11
525; GFX7-NEXT:    s_ashr_i32 s5, s5, 24
526; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
527; GFX7-NEXT:    s_ashr_i32 s4, s4, 24
528; GFX7-NEXT:    v_mov_b32_e32 v1, s5
529; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v1, v0
530; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
531; GFX7-NEXT:    s_endpgm
532;
533; GFX8-LABEL: idot4_multiuse_mul1:
534; GFX8:       ; %bb.0: ; %entry
535; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
536; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
537; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
538; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
539; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
540; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
541; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
542; GFX8-NEXT:    s_sext_i32_i8 s4, s2
543; GFX8-NEXT:    s_sext_i32_i8 s5, s3
544; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x80008
545; GFX8-NEXT:    v_mov_b32_e32 v0, s5
546; GFX8-NEXT:    v_mov_b32_e32 v1, s10
547; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x80008
548; GFX8-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
549; GFX8-NEXT:    v_mov_b32_e32 v2, s7
550; GFX8-NEXT:    s_bfe_i32 s9, s3, 0x80010
551; GFX8-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
552; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x80010
553; GFX8-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
554; GFX8-NEXT:    v_mov_b32_e32 v1, s9
555; GFX8-NEXT:    s_ashr_i32 s3, s3, 24
556; GFX8-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
557; GFX8-NEXT:    s_ashr_i32 s2, s2, 24
558; GFX8-NEXT:    v_mov_b32_e32 v1, s3
559; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
560; GFX8-NEXT:    v_mov_b32_e32 v0, s0
561; GFX8-NEXT:    v_mov_b32_e32 v1, s1
562; GFX8-NEXT:    flat_store_dword v[0:1], v2
563; GFX8-NEXT:    s_endpgm
564;
565; GFX9-NODL-LABEL: idot4_multiuse_mul1:
566; GFX9-NODL:       ; %bb.0: ; %entry
567; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
568; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
569; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
570; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
571; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
572; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
573; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
574; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
575; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
576; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
577; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80008
578; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
579; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
580; GFX9-NODL-NEXT:    s_bfe_i32 s6, s2, 0x80008
581; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s4, v1, v2
582; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s7
583; GFX9-NODL-NEXT:    s_bfe_i32 s9, s3, 0x80010
584; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
585; GFX9-NODL-NEXT:    s_bfe_i32 s8, s2, 0x80010
586; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
587; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
588; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 24
589; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
590; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
591; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
592; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
593; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
594; GFX9-NODL-NEXT:    s_endpgm
595;
596; GFX9-DL-LABEL: idot4_multiuse_mul1:
597; GFX9-DL:       ; %bb.0: ; %entry
598; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
599; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
600; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
601; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
602; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
603; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
604; GFX9-DL-NEXT:    s_load_dword s10, s[0:1], 0x0
605; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
606; GFX9-DL-NEXT:    s_sext_i32_i8 s4, s2
607; GFX9-DL-NEXT:    s_sext_i32_i8 s5, s3
608; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x80008
609; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
610; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s10
611; GFX9-DL-NEXT:    s_bfe_i32 s6, s2, 0x80008
612; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v1, v2
613; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s7
614; GFX9-DL-NEXT:    s_bfe_i32 s9, s3, 0x80010
615; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
616; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x80010
617; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
618; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s9
619; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 24
620; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
621; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 24
622; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
623; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
624; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
625; GFX9-DL-NEXT:    s_endpgm
626;
627; GFX10-DL-LABEL: idot4_multiuse_mul1:
628; GFX10-DL:       ; %bb.0: ; %entry
629; GFX10-DL-NEXT:    s_clause 0x1
630; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
631; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
632; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
633; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
634; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
635; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
636; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
637; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
638; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
639; GFX10-DL-NEXT:    s_sext_i32_i8 s2, s0
640; GFX10-DL-NEXT:    s_sext_i32_i8 s3, s1
641; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x80008
642; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x80008
643; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s2, s3, v0
644; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s7, v0
645; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s2, s3, v0
646; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x80010
647; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x80010
648; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 24
649; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 24
650; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s2, s3, v0
651; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s0, s1, v0
652; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
653; GFX10-DL-NEXT:    s_endpgm
654                                               <4 x i8> addrspace(1)* %src2,
655                                               i32 addrspace(1)* nocapture %dst) {
656entry:
  ; Load both packed 4 x i8 operands.
657  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
658  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
659
  ; Element 0: sign-extend both lanes to i32 and multiply. This product is
  ; consumed twice in the add chain at the end of the function.
660  %v1e0 = extractelement <4 x i8> %vec1, i64 0
661  %cv1e0 = sext i8 %v1e0 to i32
662  %v2e0 = extractelement <4 x i8> %vec2, i64 0
663  %cv2e0 = sext i8 %v2e0 to i32
664  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
665
  ; Element 1.
666  %v1e1 = extractelement <4 x i8> %vec1, i64 1
667  %cv1e1 = sext i8 %v1e1 to i32
668  %v2e1 = extractelement <4 x i8> %vec2, i64 1
669  %cv2e1 = sext i8 %v2e1 to i32
670  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
671
  ; Element 2.
672  %v1e2 = extractelement <4 x i8> %vec1, i64 2
673  %cv1e2 = sext i8 %v1e2 to i32
674  %v2e2 = extractelement <4 x i8> %vec2, i64 2
675  %cv2e2 = sext i8 %v2e2 to i32
676  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
677
  ; Element 3.
678  %v1e3 = extractelement <4 x i8> %vec1, i64 3
679  %cv1e3 = sext i8 %v1e3 to i32
680  %v2e3 = extractelement <4 x i8> %vec2, i64 3
681  %cv2e3 = sext i8 %v2e3 to i32
682  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
683
  ; Accumulate onto the i32 read from %dst. %mul1 is added in %add and then
  ; a second time in %add2, which is the multi-use this test exercises.
684  %acc = load i32, i32 addrspace(1)* %dst, align 4
685  %add = add i32 %mul1, %acc
686  %add1 = add i32 %mul2, %add
687  %add2 = add i32 %add1, %mul1
688  %add3 = add i32 %add2, %mul3
689  %add4 = add i32 %add3, %mul4
690
691  store i32 %add4, i32 addrspace(1)* %dst, align 4
692  ret void
693}
694
695; TODO: Support this pattern.
696define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
697; GFX7-LABEL: idot4_acc32_vecMul:
698; GFX7:       ; %bb.0: ; %entry
699; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
700; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
701; GFX7-NEXT:    s_mov_b32 s3, 0xf000
702; GFX7-NEXT:    s_mov_b32 s2, -1
703; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
704; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
705; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
706; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
707; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
708; GFX7-NEXT:    s_ashr_i32 s6, s4, 24
709; GFX7-NEXT:    s_ashr_i32 s9, s5, 24
710; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x80010
711; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80008
712; GFX7-NEXT:    s_sext_i32_i8 s5, s5
713; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x80010
714; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80008
715; GFX7-NEXT:    s_sext_i32_i8 s4, s4
716; GFX7-NEXT:    v_mov_b32_e32 v0, s5
717; GFX7-NEXT:    v_mov_b32_e32 v1, s12
718; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
719; GFX7-NEXT:    v_mov_b32_e32 v1, s11
720; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
721; GFX7-NEXT:    v_mov_b32_e32 v1, s10
722; GFX7-NEXT:    v_mad_i32_i24 v0, s7, v1, v0
723; GFX7-NEXT:    v_mov_b32_e32 v1, s9
724; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
725; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
726; GFX7-NEXT:    s_endpgm
727;
728; GFX8-LABEL: idot4_acc32_vecMul:
729; GFX8:       ; %bb.0: ; %entry
730; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
731; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
732; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
733; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
734; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
735; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x0
736; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
737; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
738; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
739; GFX8-NEXT:    s_ashr_i32 s6, s3, 24
740; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x80010
741; GFX8-NEXT:    s_sext_i32_i8 s3, s3
742; GFX8-NEXT:    s_ashr_i32 s4, s2, 24
743; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x80010
744; GFX8-NEXT:    s_sext_i32_i8 s2, s2
745; GFX8-NEXT:    v_mov_b32_e32 v2, s3
746; GFX8-NEXT:    v_mov_b32_e32 v3, s8
747; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
748; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 8
749; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v2, v3
750; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v1, v2
751; GFX8-NEXT:    v_mov_b32_e32 v1, s7
752; GFX8-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
753; GFX8-NEXT:    v_mov_b32_e32 v1, s6
754; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v1, v0
755; GFX8-NEXT:    v_mov_b32_e32 v0, s0
756; GFX8-NEXT:    v_mov_b32_e32 v1, s1
757; GFX8-NEXT:    flat_store_dword v[0:1], v2
758; GFX8-NEXT:    s_endpgm
759;
760; GFX9-NODL-LABEL: idot4_acc32_vecMul:
761; GFX9-NODL:       ; %bb.0: ; %entry
762; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
763; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
764; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
765; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
766; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
767; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
768; GFX9-NODL-NEXT:    s_load_dword s8, s[0:1], 0x0
769; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
770; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
771; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
772; GFX9-NODL-NEXT:    s_ashr_i32 s6, s3, 24
773; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80010
774; GFX9-NODL-NEXT:    s_sext_i32_i8 s3, s3
775; GFX9-NODL-NEXT:    s_ashr_i32 s4, s2, 24
776; GFX9-NODL-NEXT:    s_bfe_i32 s5, s2, 0x80010
777; GFX9-NODL-NEXT:    s_sext_i32_i8 s2, s2
778; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
779; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
780; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
781; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
782; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, s2, v3, v4
783; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v1, v2, v3
784; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
785; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s5, v2, v1
786; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
787; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v2, v1
788; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
789; GFX9-NODL-NEXT:    s_endpgm
790;
791; GFX9-DL-LABEL: idot4_acc32_vecMul:
792; GFX9-DL:       ; %bb.0: ; %entry
793; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
794; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
795; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
796; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
797; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
798; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
799; GFX9-DL-NEXT:    s_load_dword s8, s[0:1], 0x0
800; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
801; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
802; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
803; GFX9-DL-NEXT:    s_ashr_i32 s6, s3, 24
804; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x80010
805; GFX9-DL-NEXT:    s_sext_i32_i8 s3, s3
806; GFX9-DL-NEXT:    s_ashr_i32 s4, s2, 24
807; GFX9-DL-NEXT:    s_bfe_i32 s5, s2, 0x80010
808; GFX9-DL-NEXT:    s_sext_i32_i8 s2, s2
809; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
810; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s8
811; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
812; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
813; GFX9-DL-NEXT:    v_mad_i32_i24 v3, s2, v3, v4
814; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v1, v2, v3
815; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
816; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s5, v2, v1
817; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
818; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v2, v1
819; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
820; GFX9-DL-NEXT:    s_endpgm
821;
822; GFX10-DL-LABEL: idot4_acc32_vecMul:
823; GFX10-DL:       ; %bb.0: ; %entry
824; GFX10-DL-NEXT:    s_clause 0x1
825; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
826; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
827; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
828; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
829; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
830; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
831; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
832; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
833; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
834; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
835; GFX10-DL-NEXT:    s_sext_i32_i8 s4, s2
836; GFX10-DL-NEXT:    s_sext_i32_i8 s5, s3
837; GFX10-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
838; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
839; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
840; GFX10-DL-NEXT:    s_bfe_i32 s4, s2, 0x80010
841; GFX10-DL-NEXT:    s_bfe_i32 s5, s3, 0x80010
842; GFX10-DL-NEXT:    s_ashr_i32 s2, s2, 24
843; GFX10-DL-NEXT:    s_ashr_i32 s3, s3, 24
844; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, v1, v2
845; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
846; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s4, s5, v0
847; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s2, s3, v0
848; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
849; GFX10-DL-NEXT:    s_endpgm
850                                              <4 x i8> addrspace(1)* %src2,
851                                              i32 addrspace(1)* nocapture %dst) {
852entry:
853  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
854  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
855
856  %cvec1 = sext <4 x i8> %vec1 to <4 x i32>
857  %cvec2 = sext <4 x i8> %vec2 to <4 x i32>
858
859  %mul = mul <4 x i32> %cvec1, %cvec2
860  %mul0 = extractelement <4 x i32> %mul, i64 0
861  %mul1 = extractelement <4 x i32> %mul, i64 1
862  %mul2 = extractelement <4 x i32> %mul, i64 2
863  %mul3 = extractelement <4 x i32> %mul, i64 3
864
865  %acc = load i32, i32 addrspace(1)* %dst, align 4
866  %add1 = add i32 %mul0, %acc
867  %add2 = add i32 %add1, %mul1
868  %add3 = add i32 %add2, %mul2
869  %add4 = add i32 %add3, %mul3
870
871  store i32 %add4, i32 addrspace(1)* %dst, align 4
872  ret void
873}
874
; idot4_acc16_vecMul: signed 4 x i8 dot product written as one vector multiply
; and accumulated into a 16-bit value.
; Both <4 x i8> operands are sign-extended to <4 x i16> and multiplied
; element-wise; the four product lanes are then added, one after another, to
; the i16 loaded from %dst, and the i16 sum is stored back to %dst.
; The check lines that follow are autogenerated (see the NOTE on the first line
; of this file) and record the llc output expected for each RUN configuration.
; As recorded here, the gfx700 and gfx803 expectations use four v_mad_i32_i24,
; while the gfx900, gfx906 and gfx1011/gfx1012 expectations use two
; v_pk_mul_lo_u16 followed by scalar adds.
875define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
876; GFX7-LABEL: idot4_acc16_vecMul:
877; GFX7:       ; %bb.0: ; %entry
878; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
879; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
880; GFX7-NEXT:    s_mov_b32 s3, 0xf000
881; GFX7-NEXT:    s_mov_b32 s2, -1
882; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
883; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
884; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
885; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
886; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
887; GFX7-NEXT:    s_ashr_i32 s6, s4, 24
888; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x80010
889; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80008
890; GFX7-NEXT:    s_ashr_i32 s9, s5, 24
891; GFX7-NEXT:    s_sext_i32_i8 s5, s5
892; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x80010
893; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80008
894; GFX7-NEXT:    s_sext_i32_i8 s4, s4
895; GFX7-NEXT:    v_mov_b32_e32 v1, s5
896; GFX7-NEXT:    v_mov_b32_e32 v2, s11
897; GFX7-NEXT:    v_mov_b32_e32 v3, s10
898; GFX7-NEXT:    s_waitcnt vmcnt(0)
899; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v1, v0
900; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v2, v0
901; GFX7-NEXT:    v_mad_i32_i24 v0, s7, v3, v0
902; GFX7-NEXT:    v_mov_b32_e32 v1, s9
903; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
904; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
905; GFX7-NEXT:    s_endpgm
906;
907; GFX8-LABEL: idot4_acc16_vecMul:
908; GFX8:       ; %bb.0: ; %entry
909; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
910; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
911; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
912; GFX8-NEXT:    v_mov_b32_e32 v0, s0
913; GFX8-NEXT:    v_mov_b32_e32 v1, s1
914; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
915; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
916; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
917; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
918; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 8, s0
919; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 8, s1
920; GFX8-NEXT:    s_bfe_i32 s5, s1, 0x80010
921; GFX8-NEXT:    s_ashr_i32 s4, s1, 24
922; GFX8-NEXT:    s_sext_i32_i8 s1, s1
923; GFX8-NEXT:    s_ashr_i32 s2, s0, 24
924; GFX8-NEXT:    s_bfe_i32 s3, s0, 0x80010
925; GFX8-NEXT:    s_sext_i32_i8 s0, s0
926; GFX8-NEXT:    v_mov_b32_e32 v5, s1
927; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
928; GFX8-NEXT:    v_bfe_i32 v4, v4, 0, 8
929; GFX8-NEXT:    v_mov_b32_e32 v6, s5
930; GFX8-NEXT:    s_waitcnt vmcnt(0)
931; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v5, v2
932; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v4, v2
933; GFX8-NEXT:    v_mad_i32_i24 v2, s3, v6, v2
934; GFX8-NEXT:    v_mov_b32_e32 v3, s4
935; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
936; GFX8-NEXT:    flat_store_short v[0:1], v2
937; GFX8-NEXT:    s_endpgm
938;
939; GFX9-NODL-LABEL: idot4_acc16_vecMul:
940; GFX9-NODL:       ; %bb.0: ; %entry
941; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
942; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
943; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, 0xffff
944; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
945; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
946; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
947; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
948; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
949; GFX9-NODL-NEXT:    s_lshr_b32 s4, s2, 16
950; GFX9-NODL-NEXT:    s_lshr_b32 s5, s3, 16
951; GFX9-NODL-NEXT:    v_ashrrev_i16_e64 v4, 8, s5
952; GFX9-NODL-NEXT:    s_bfe_i32 s5, s5, 0x80000
953; GFX9-NODL-NEXT:    v_ashrrev_i16_e64 v3, 8, s4
954; GFX9-NODL-NEXT:    v_and_b32_e32 v6, s5, v5
955; GFX9-NODL-NEXT:    s_bfe_i32 s4, s4, 0x80000
956; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, v4, 16, v6
957; GFX9-NODL-NEXT:    v_and_b32_e32 v6, s4, v5
958; GFX9-NODL-NEXT:    v_lshl_or_b32 v3, v3, 16, v6
959; GFX9-NODL-NEXT:    v_ashrrev_i16_e64 v2, 8, s3
960; GFX9-NODL-NEXT:    s_bfe_i32 s3, s3, 0x80000
961; GFX9-NODL-NEXT:    v_ashrrev_i16_e64 v1, 8, s2
962; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
963; GFX9-NODL-NEXT:    v_and_b32_e32 v4, s3, v5
964; GFX9-NODL-NEXT:    s_bfe_i32 s2, s2, 0x80000
965; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
966; GFX9-NODL-NEXT:    v_and_b32_e32 v4, s2, v5
967; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
968; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
969; GFX9-NODL-NEXT:    global_load_ushort v2, v0, s[0:1]
970; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
971; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v1, v2
972; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
973; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v3
974; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
975; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
976; GFX9-NODL-NEXT:    s_endpgm
977;
978; GFX9-DL-LABEL: idot4_acc16_vecMul:
979; GFX9-DL:       ; %bb.0: ; %entry
980; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
981; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
982; GFX9-DL-NEXT:    v_mov_b32_e32 v5, 0xffff
983; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
984; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
985; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
986; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
987; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
988; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 16
989; GFX9-DL-NEXT:    s_lshr_b32 s5, s3, 16
990; GFX9-DL-NEXT:    v_ashrrev_i16_e64 v4, 8, s5
991; GFX9-DL-NEXT:    s_bfe_i32 s5, s5, 0x80000
992; GFX9-DL-NEXT:    v_ashrrev_i16_e64 v3, 8, s4
993; GFX9-DL-NEXT:    v_and_b32_e32 v6, s5, v5
994; GFX9-DL-NEXT:    s_bfe_i32 s4, s4, 0x80000
995; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v6
996; GFX9-DL-NEXT:    v_and_b32_e32 v6, s4, v5
997; GFX9-DL-NEXT:    v_lshl_or_b32 v3, v3, 16, v6
998; GFX9-DL-NEXT:    v_ashrrev_i16_e64 v2, 8, s3
999; GFX9-DL-NEXT:    s_bfe_i32 s3, s3, 0x80000
1000; GFX9-DL-NEXT:    v_ashrrev_i16_e64 v1, 8, s2
1001; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
1002; GFX9-DL-NEXT:    v_and_b32_e32 v4, s3, v5
1003; GFX9-DL-NEXT:    s_bfe_i32 s2, s2, 0x80000
1004; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
1005; GFX9-DL-NEXT:    v_and_b32_e32 v4, s2, v5
1006; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
1007; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1008; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[0:1]
1009; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1010; GFX9-DL-NEXT:    v_add_u32_e32 v2, v1, v2
1011; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1012; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v3
1013; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1014; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
1015; GFX9-DL-NEXT:    s_endpgm
1016;
1017; GFX10-DL-LABEL: idot4_acc16_vecMul:
1018; GFX10-DL:       ; %bb.0: ; %entry
1019; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1020; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1021; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1022; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
1023; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1024; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
1025; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1026; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1027; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1028; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 16
1029; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 8, s0
1030; GFX10-DL-NEXT:    s_bfe_i32 s0, s0, 0x80000
1031; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x80000
1032; GFX10-DL-NEXT:    v_and_b32_e32 v6, s0, v2
1033; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 8, s1
1034; GFX10-DL-NEXT:    v_and_b32_e32 v5, s3, v2
1035; GFX10-DL-NEXT:    s_lshr_b32 s0, s1, 16
1036; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 8, s2
1037; GFX10-DL-NEXT:    v_lshl_or_b32 v3, v3, 16, v6
1038; GFX10-DL-NEXT:    s_bfe_i32 s1, s2, 0x80000
1039; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
1040; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x80000
1041; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 8, s0
1042; GFX10-DL-NEXT:    v_and_b32_e32 v6, s2, v2
1043; GFX10-DL-NEXT:    v_and_b32_e32 v2, s1, v2
1044; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
1045; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v5, 16, v6
1046; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
1047; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
1048; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1049; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v3, v1
1050; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1051; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1052; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1053; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
1054; GFX10-DL-NEXT:    s_endpgm
1055                                              <4 x i8> addrspace(1)* %src2,
1056                                              i16 addrspace(1)* nocapture %dst) {
1057entry:
; Load the two packed <4 x i8> operands from address space 1.
1058  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
1059  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
1060
; Sign-extend every i8 lane to i16 so the multiply is done on signed 16-bit lanes.
1061  %cvec1 = sext <4 x i8> %vec1 to <4 x i16>
1062  %cvec2 = sext <4 x i8> %vec2 to <4 x i16>
1063
; A single vector multiply produces all four products; each lane is then
; extracted so it can be accumulated as a scalar.
1064  %mul = mul <4 x i16> %cvec1, %cvec2
1065  %mul0 = extractelement <4 x i16> %mul, i64 0
1066  %mul1 = extractelement <4 x i16> %mul, i64 1
1067  %mul2 = extractelement <4 x i16> %mul, i64 2
1068  %mul3 = extractelement <4 x i16> %mul, i64 3
1069
; The accumulator is the i16 currently stored at %dst; products are added in
; lane order 0, 1, 2, 3.
1070  %acc = load i16, i16 addrspace(1)* %dst, align 4
1071  %add1 = add i16 %mul0, %acc
1072  %add2 = add i16 %add1, %mul1
1073  %add3 = add i16 %add2, %mul2
1074  %add4 = add i16 %add3, %mul3
1075
; Write the final i16 sum back over the accumulator at %dst.
1076  store i16 %add4, i16 addrspace(1)* %dst, align 4
1077  ret void
1078}
1079