1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
8
9; add(mul(S0.x, S1.x),
10;     add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
11
; udot2: unsigned 2-lane dot product with accumulate.
; Loads a <2 x i16> from %src1 and from %src2, zero-extends every lane to i32,
; multiplies matching lanes, and adds both products to the i32 loaded from
; %dst before storing the sum back to %dst.
; The assertions below expect one v_dot2_u32_u16 on gfx906, gfx1011 and
; gfx1012, and a pair of v_mad_u32_u24 on gfx700, gfx803 and gfx900.
12define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
13; GFX7-LABEL: udot2:
14; GFX7:       ; %bb.0: ; %entry
15; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
16; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
17; GFX7-NEXT:    s_mov_b32 s8, 0xffff
18; GFX7-NEXT:    s_mov_b32 s3, 0xf000
19; GFX7-NEXT:    s_mov_b32 s2, -1
20; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
22; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
23; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
24; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
25; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
26; GFX7-NEXT:    s_and_b32 s4, s4, s8
27; GFX7-NEXT:    s_and_b32 s5, s5, s8
28; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
29; GFX7-NEXT:    v_mov_b32_e32 v0, s6
30; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
31; GFX7-NEXT:    v_mov_b32_e32 v1, s8
32; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
33; GFX7-NEXT:    v_mov_b32_e32 v1, s4
34; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
35; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
36; GFX7-NEXT:    s_endpgm
37;
38; GFX8-LABEL: udot2:
39; GFX8:       ; %bb.0: ; %entry
40; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
41; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
42; GFX8-NEXT:    s_mov_b32 s2, 0xffff
43; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
45; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
46; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
47; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
48; GFX8-NEXT:    s_and_b32 s6, s3, s2
49; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
50; GFX8-NEXT:    s_and_b32 s2, s4, s2
51; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
52; GFX8-NEXT:    v_mov_b32_e32 v0, s5
53; GFX8-NEXT:    v_mov_b32_e32 v1, s3
54; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
55; GFX8-NEXT:    v_mov_b32_e32 v1, s6
56; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
57; GFX8-NEXT:    v_mov_b32_e32 v0, s0
58; GFX8-NEXT:    v_mov_b32_e32 v1, s1
59; GFX8-NEXT:    flat_store_dword v[0:1], v2
60; GFX8-NEXT:    s_endpgm
61;
62; GFX9-NODL-LABEL: udot2:
63; GFX9-NODL:       ; %bb.0: ; %entry
64; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
65; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
66; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
67; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
68; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
70; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
71; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
72; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
74; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
75; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
76; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
77; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
78; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
79; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
80; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
81; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
82; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
83; GFX9-NODL-NEXT:    s_endpgm
84;
85; GFX9-DL-LABEL: udot2:
86; GFX9-DL:       ; %bb.0: ; %entry
87; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
88; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
89; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
90; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
91; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
92; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
93; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
94; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
96; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
97; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, s4, v1, v2
98; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
99; GFX9-DL-NEXT:    s_endpgm
100;
101; GFX10-DL-LABEL: udot2:
102; GFX10-DL:       ; %bb.0: ; %entry
103; GFX10-DL-NEXT:    s_clause 0x1
104; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
105; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
106; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
107; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
108; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
109; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
110; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
111; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
112; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
113; GFX10-DL-NEXT:    v_dot2_u32_u16 v0, s1, s0, v0
114; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
115; GFX10-DL-NEXT:    s_endpgm
116                                 <2 x i16> addrspace(1)* %src2,
117                                 i32 addrspace(1)* nocapture %dst) {
118entry:
  ; Fetch both packed 16-bit input vectors.
119  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
120  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
121
  ; Lane 0 of each vector, zero-extended to i32, then multiplied.
122  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
123  %conv = zext i16 %s1.elt1 to i32
124  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
125  %conv2 = zext i16 %s2.elt1 to i32
126  %mul1 = mul nuw i32 %conv2, %conv
127
  ; Lane 1 of each vector, zero-extended to i32, then multiplied.
128  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
129  %conv3 = zext i16 %s1.elt2 to i32
130  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
131  %conv4 = zext i16 %s2.elt2 to i32
132  %mul2 = mul nuw i32 %conv4, %conv3
133
  ; %dst supplies the accumulator and receives the result. The lane-1 product
  ; is added to the accumulator first, then the lane-0 product is added on top.
134  %s3 = load i32, i32 addrspace(1)* %dst, align 4
135  %add = add i32 %mul2, %s3
136  %add6 = add i32 %add, %mul1
137  store i32 %add6, i32 addrspace(1)* %dst, align 4
138  ret void
139}
140
141; TODO: Support this pattern
142;      add(S3,
143;          add (mul (S0.y, S1.y), mul (S0.x, S1.x))) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
; udot2_MulMul: unsigned 2-lane dot product where the two lane products are
; summed with each other first and the accumulator loaded from %dst is added
; last. The result is stored back to %dst.
; None of the assertions below contain v_dot2; every target is expected to
; emit v_mul_u32_u24, then v_mad_u32_u24, then a separate 32-bit add.
144define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
145; GFX7-LABEL: udot2_MulMul:
146; GFX7:       ; %bb.0: ; %entry
147; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
148; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
149; GFX7-NEXT:    s_mov_b32 s8, 0xffff
150; GFX7-NEXT:    s_mov_b32 s3, 0xf000
151; GFX7-NEXT:    s_mov_b32 s2, -1
152; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
153; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
154; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
155; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
156; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
157; GFX7-NEXT:    s_and_b32 s4, s4, s8
158; GFX7-NEXT:    v_mov_b32_e32 v0, s4
159; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
160; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
161; GFX7-NEXT:    s_and_b32 s5, s5, s8
162; GFX7-NEXT:    v_mul_u32_u24_e32 v0, s5, v0
163; GFX7-NEXT:    v_mov_b32_e32 v1, s6
164; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
165; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
167; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
168; GFX7-NEXT:    s_endpgm
169;
170; GFX8-LABEL: udot2_MulMul:
171; GFX8:       ; %bb.0: ; %entry
172; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
173; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
174; GFX8-NEXT:    s_mov_b32 s2, 0xffff
175; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
176; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
177; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
178; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
179; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
180; GFX8-NEXT:    s_and_b32 s6, s3, s2
181; GFX8-NEXT:    s_and_b32 s2, s4, s2
182; GFX8-NEXT:    v_mov_b32_e32 v0, s6
183; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
184; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
185; GFX8-NEXT:    v_mov_b32_e32 v1, s3
186; GFX8-NEXT:    v_mul_u32_u24_e32 v0, s2, v0
187; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
188; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s5, v0
189; GFX8-NEXT:    v_mov_b32_e32 v0, s0
190; GFX8-NEXT:    v_mov_b32_e32 v1, s1
191; GFX8-NEXT:    flat_store_dword v[0:1], v2
192; GFX8-NEXT:    s_endpgm
193;
194; GFX9-NODL-LABEL: udot2_MulMul:
195; GFX9-NODL:       ; %bb.0: ; %entry
196; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
197; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
198; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
199; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
200; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
202; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
203; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
204; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
206; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
207; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
208; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
209; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
210; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v1, s2, v1
211; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
212; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
213; GFX9-NODL-NEXT:    v_add_u32_e32 v1, s5, v1
214; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
215; GFX9-NODL-NEXT:    s_endpgm
216;
217; GFX9-DL-LABEL: udot2_MulMul:
218; GFX9-DL:       ; %bb.0: ; %entry
219; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
220; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
221; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
222; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
223; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
224; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
225; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
226; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
227; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX9-DL-NEXT:    s_and_b32 s6, s3, s2
229; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
230; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
231; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
232; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 16
233; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v1, s2, v1
234; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
235; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
236; GFX9-DL-NEXT:    v_add_u32_e32 v1, s5, v1
237; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
238; GFX9-DL-NEXT:    s_endpgm
239;
240; GFX10-DL-LABEL: udot2_MulMul:
241; GFX10-DL:       ; %bb.0: ; %entry
242; GFX10-DL-NEXT:    s_clause 0x1
243; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
244; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
245; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
246; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
247; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
248; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
249; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
250; GFX10-DL-NEXT:    s_mov_b32 s5, 0xffff
251; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
252; GFX10-DL-NEXT:    s_and_b32 s6, s2, s5
253; GFX10-DL-NEXT:    s_and_b32 s5, s3, s5
254; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 16
255; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v0, s5, s6
256; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 16
257; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
258; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, s4, v0
259; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
260; GFX10-DL-NEXT:    s_endpgm
261                                        <2 x i16> addrspace(1)* %src2,
262                                        i32 addrspace(1)* nocapture %dst) {
263entry:
  ; Fetch both packed 16-bit input vectors.
264  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
265  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
266
  ; Lane 0 of each vector, zero-extended to i32, then multiplied.
267  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
268  %conv = zext i16 %s1.elt1 to i32
269  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
270  %conv2 = zext i16 %s2.elt1 to i32
271  %mul1 = mul nuw i32 %conv2, %conv
272
  ; Lane 1 of each vector, zero-extended to i32, then multiplied.
273  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
274  %conv3 = zext i16 %s1.elt2 to i32
275  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
276  %conv4 = zext i16 %s2.elt2 to i32
277  %mul2 = mul nuw i32 %conv4, %conv3
  ; The two products are added together before the accumulator joins the sum;
  ; this association is what distinguishes this test's add chain.
278  %s3 = load i32, i32 addrspace(1)* %dst, align 4
279  %add = add i32 %mul2, %mul1
280  %add6 = add i32 %add, %s3
281  store i32 %add6, i32 addrspace(1)* %dst, align 4
282  ret void
283}
284
; idot2: signed 2-lane dot product with accumulate.
; Loads a <2 x i16> from %src1 and from %src2, sign-extends every lane to i32,
; multiplies matching lanes, and adds both products to the i32 loaded from
; %dst before storing the sum back to %dst.
; The assertions below expect one v_dot2_i32_i16 on gfx906, gfx1011 and
; gfx1012, and a pair of v_mad_i32_i24 on gfx700, gfx803 and gfx900.
285define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
286; GFX7-LABEL: idot2:
287; GFX7:       ; %bb.0: ; %entry
288; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
289; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
290; GFX7-NEXT:    s_mov_b32 s3, 0xf000
291; GFX7-NEXT:    s_mov_b32 s2, -1
292; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
293; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
294; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
295; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
296; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX7-NEXT:    s_sext_i32_i16 s7, s4
298; GFX7-NEXT:    s_ashr_i32 s4, s4, 16
299; GFX7-NEXT:    s_sext_i32_i16 s8, s5
300; GFX7-NEXT:    s_ashr_i32 s5, s5, 16
301; GFX7-NEXT:    v_mov_b32_e32 v0, s4
302; GFX7-NEXT:    v_mov_b32_e32 v1, s6
303; GFX7-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
304; GFX7-NEXT:    v_mov_b32_e32 v1, s7
305; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
306; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
307; GFX7-NEXT:    s_endpgm
308;
309; GFX8-LABEL: idot2:
310; GFX8:       ; %bb.0: ; %entry
311; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
312; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
313; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
315; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
316; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
317; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
318; GFX8-NEXT:    s_sext_i32_i16 s5, s2
319; GFX8-NEXT:    s_ashr_i32 s2, s2, 16
320; GFX8-NEXT:    s_sext_i32_i16 s6, s3
321; GFX8-NEXT:    s_ashr_i32 s3, s3, 16
322; GFX8-NEXT:    v_mov_b32_e32 v0, s4
323; GFX8-NEXT:    v_mov_b32_e32 v1, s2
324; GFX8-NEXT:    v_mov_b32_e32 v2, s5
325; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
326; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
327; GFX8-NEXT:    v_mov_b32_e32 v0, s0
328; GFX8-NEXT:    v_mov_b32_e32 v1, s1
329; GFX8-NEXT:    flat_store_dword v[0:1], v2
330; GFX8-NEXT:    s_endpgm
331;
332; GFX9-NODL-LABEL: idot2:
333; GFX9-NODL:       ; %bb.0: ; %entry
334; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
335; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
336; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
337; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
338; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
339; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
340; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
341; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
342; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
343; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
344; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
345; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
346; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
347; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
348; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
349; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
350; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
351; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
352; GFX9-NODL-NEXT:    s_endpgm
353;
354; GFX9-DL-LABEL: idot2:
355; GFX9-DL:       ; %bb.0: ; %entry
356; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
357; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
358; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
359; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
361; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
362; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
363; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
364; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
365; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
366; GFX9-DL-NEXT:    v_dot2_i32_i16 v1, s4, v1, v2
367; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
368; GFX9-DL-NEXT:    s_endpgm
369;
370; GFX10-DL-LABEL: idot2:
371; GFX10-DL:       ; %bb.0: ; %entry
372; GFX10-DL-NEXT:    s_clause 0x1
373; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
374; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
375; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
376; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
377; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
378; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
379; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
380; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
381; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
382; GFX10-DL-NEXT:    v_dot2_i32_i16 v0, s1, s0, v0
383; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
384; GFX10-DL-NEXT:    s_endpgm
385                                 <2 x i16> addrspace(1)* %src2,
386                                 i32 addrspace(1)* nocapture %dst) {
387entry:
  ; Fetch both packed 16-bit input vectors.
388  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
389  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
390
  ; Lane 0 of each vector, sign-extended to i32, then multiplied.
  ; NOTE(review): the multiplies carry nuw although their operands are
  ; sign-extended; confirm the flag is intended for the signed tests.
391  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
392  %conv = sext i16 %s1.elt1 to i32
393  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
394  %conv2 = sext i16 %s2.elt1 to i32
395  %mul1 = mul nuw i32 %conv2, %conv
396
  ; Lane 1 of each vector, sign-extended to i32, then multiplied.
397  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
398  %conv3 = sext i16 %s1.elt2 to i32
399  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
400  %conv4 = sext i16 %s2.elt2 to i32
401  %mul2 = mul nuw i32 %conv4, %conv3
402
  ; %dst supplies the accumulator and receives the result. The lane-1 product
  ; is added to the accumulator first, then the lane-0 product is added on top.
403  %s3 = load i32, i32 addrspace(1)* %dst, align 4
404  %add = add i32 %mul2, %s3
405  %add6 = add i32 %add, %mul1
406  store i32 %add6, i32 addrspace(1)* %dst, align 4
407  ret void
408}
409
; idot2_MixedTypedMul: 2-lane multiply-accumulate whose two products use
; different extensions. Lane 0 of both inputs is sign-extended and lane 1 of
; both inputs is zero-extended before the multiplies; both products are added
; to the i32 loaded from %dst and the sum is stored back to %dst.
; None of the assertions below contain v_dot2; every target is expected to
; emit v_mad_u32_u24 for the lane-1 product and v_mad_i32_i24 for lane 0.
410define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
411; GFX7-LABEL: idot2_MixedTypedMul:
412; GFX7:       ; %bb.0: ; %entry
413; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
414; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
415; GFX7-NEXT:    s_mov_b32 s3, 0xf000
416; GFX7-NEXT:    s_mov_b32 s2, -1
417; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
418; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
419; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
420; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
421; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
422; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
423; GFX7-NEXT:    s_lshr_b32 s8, s5, 16
424; GFX7-NEXT:    s_sext_i32_i16 s4, s4
425; GFX7-NEXT:    v_mov_b32_e32 v0, s7
426; GFX7-NEXT:    v_mov_b32_e32 v1, s6
427; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v0, v1
428; GFX7-NEXT:    s_sext_i32_i16 s5, s5
429; GFX7-NEXT:    v_mov_b32_e32 v1, s4
430; GFX7-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
431; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
432; GFX7-NEXT:    s_endpgm
433;
434; GFX8-LABEL: idot2_MixedTypedMul:
435; GFX8:       ; %bb.0: ; %entry
436; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
437; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
438; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
439; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
440; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
441; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
442; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
443; GFX8-NEXT:    s_sext_i32_i16 s5, s2
444; GFX8-NEXT:    s_lshr_b32 s2, s2, 16
445; GFX8-NEXT:    s_sext_i32_i16 s6, s3
446; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
447; GFX8-NEXT:    v_mov_b32_e32 v0, s4
448; GFX8-NEXT:    v_mov_b32_e32 v1, s2
449; GFX8-NEXT:    v_mov_b32_e32 v2, s5
450; GFX8-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
451; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
452; GFX8-NEXT:    v_mov_b32_e32 v0, s0
453; GFX8-NEXT:    v_mov_b32_e32 v1, s1
454; GFX8-NEXT:    flat_store_dword v[0:1], v2
455; GFX8-NEXT:    s_endpgm
456;
457; GFX9-NODL-LABEL: idot2_MixedTypedMul:
458; GFX9-NODL:       ; %bb.0: ; %entry
459; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
460; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
461; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
462; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
463; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
464; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
465; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
466; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
467; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
468; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
469; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
470; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
471; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
472; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
473; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
474; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
475; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
476; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
477; GFX9-NODL-NEXT:    s_endpgm
478;
479; GFX9-DL-LABEL: idot2_MixedTypedMul:
480; GFX9-DL:       ; %bb.0: ; %entry
481; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
482; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
483; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
484; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
485; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
486; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
487; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
488; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
489; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
490; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 16
491; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
492; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
493; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
494; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
495; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
496; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
497; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
498; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
499; GFX9-DL-NEXT:    s_endpgm
500;
501; GFX10-DL-LABEL: idot2_MixedTypedMul:
502; GFX10-DL:       ; %bb.0: ; %entry
503; GFX10-DL-NEXT:    s_clause 0x1
504; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
505; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
506; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
507; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
508; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
509; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
510; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
511; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
512; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
513; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 16
514; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 16
515; GFX10-DL-NEXT:    s_sext_i32_i16 s0, s0
516; GFX10-DL-NEXT:    s_sext_i32_i16 s1, s1
517; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
518; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s1, s0, v0
519; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
520; GFX10-DL-NEXT:    s_endpgm
521                                               <2 x i16> addrspace(1)* %src2,
522                                               i32 addrspace(1)* nocapture %dst) {
523entry:
  ; Fetch both packed 16-bit input vectors.
524  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
525  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
526
  ; Lane 0 of each vector: sign-extended to i32, then multiplied.
527  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
528  %conv = sext i16 %s1.elt1 to i32
529  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
530  %conv2 = sext i16 %s2.elt1 to i32
531  %mul1 = mul nuw i32 %conv2, %conv
532
  ; Lane 1 of each vector: zero-extended (unlike lane 0), then multiplied.
533  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
534  %conv3 = zext i16 %s1.elt2 to i32
535  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
536  %conv4 = zext i16 %s2.elt2 to i32
537  %mul2 = mul nuw i32 %conv4, %conv3
538
  ; %dst supplies the accumulator and receives the result. The lane-1 product
  ; is added to the accumulator first, then the lane-0 product is added on top.
539  %s3 = load i32, i32 addrspace(1)* %dst, align 4
540  %add = add i32 %mul2, %s3
541  %add6 = add i32 %add, %mul1
542  store i32 %add6, i32 addrspace(1)* %dst, align 4
543  ret void
544}
545
; udot2_alt_AddOperands: unsigned 2-lane dot product with accumulate, written
; with the operands of both adds in the opposite order (accumulator on the
; left of the first add, lane-0 product on the left of the second).
; Loads a <2 x i16> from %src1 and from %src2, zero-extends every lane to i32,
; multiplies matching lanes, adds both products to the i32 loaded from %dst,
; and stores the sum back to %dst.
; The assertions below expect one v_dot2_u32_u16 on gfx906, gfx1011 and
; gfx1012, and a pair of v_mad_u32_u24 on gfx700, gfx803 and gfx900.
546define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
547; GFX7-LABEL: udot2_alt_AddOperands:
548; GFX7:       ; %bb.0: ; %entry
549; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
550; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
551; GFX7-NEXT:    s_mov_b32 s8, 0xffff
552; GFX7-NEXT:    s_mov_b32 s3, 0xf000
553; GFX7-NEXT:    s_mov_b32 s2, -1
554; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
555; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
556; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
557; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
558; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
559; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
560; GFX7-NEXT:    s_and_b32 s4, s4, s8
561; GFX7-NEXT:    s_and_b32 s5, s5, s8
562; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
563; GFX7-NEXT:    v_mov_b32_e32 v0, s6
564; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX7-NEXT:    v_mov_b32_e32 v1, s8
566; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
567; GFX7-NEXT:    v_mov_b32_e32 v1, s4
568; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
569; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
570; GFX7-NEXT:    s_endpgm
571;
572; GFX8-LABEL: udot2_alt_AddOperands:
573; GFX8:       ; %bb.0: ; %entry
574; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
575; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
576; GFX8-NEXT:    s_mov_b32 s2, 0xffff
577; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
578; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
579; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
580; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
581; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
582; GFX8-NEXT:    s_and_b32 s6, s3, s2
583; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
584; GFX8-NEXT:    s_and_b32 s2, s4, s2
585; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
586; GFX8-NEXT:    v_mov_b32_e32 v0, s5
587; GFX8-NEXT:    v_mov_b32_e32 v1, s3
588; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
589; GFX8-NEXT:    v_mov_b32_e32 v1, s6
590; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
591; GFX8-NEXT:    v_mov_b32_e32 v0, s0
592; GFX8-NEXT:    v_mov_b32_e32 v1, s1
593; GFX8-NEXT:    flat_store_dword v[0:1], v2
594; GFX8-NEXT:    s_endpgm
595;
596; GFX9-NODL-LABEL: udot2_alt_AddOperands:
597; GFX9-NODL:       ; %bb.0: ; %entry
598; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
599; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
600; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
601; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
602; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
603; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
604; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
605; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
606; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
607; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
608; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
609; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
610; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
611; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
612; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
613; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
614; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
615; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
616; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
617; GFX9-NODL-NEXT:    s_endpgm
618;
619; GFX9-DL-LABEL: udot2_alt_AddOperands:
620; GFX9-DL:       ; %bb.0: ; %entry
621; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
622; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
623; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
624; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
626; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
627; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
628; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
629; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
630; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
631; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, s4, v1, v2
632; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
633; GFX9-DL-NEXT:    s_endpgm
634;
635; GFX10-DL-LABEL: udot2_alt_AddOperands:
636; GFX10-DL:       ; %bb.0: ; %entry
637; GFX10-DL-NEXT:    s_clause 0x1
638; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
639; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
640; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
641; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
642; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
643; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
644; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
645; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
646; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
647; GFX10-DL-NEXT:    v_dot2_u32_u16 v0, s1, s0, v0
648; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
649; GFX10-DL-NEXT:    s_endpgm
650                                                 <2 x i16> addrspace(1)* %src2,
651                                                 i32 addrspace(1)* nocapture %dst) {
652entry:
  ; Fetch both packed 16-bit input vectors.
653  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
654  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
655
  ; Lane 0 of each vector, zero-extended to i32, then multiplied.
656  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
657  %conv = zext i16 %s1.elt1 to i32
658  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
659  %conv2 = zext i16 %s2.elt1 to i32
660  %mul1 = mul nuw i32 %conv2, %conv
661
  ; Lane 1 of each vector, zero-extended to i32, then multiplied.
662  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
663  %conv3 = zext i16 %s1.elt2 to i32
664  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
665  %conv4 = zext i16 %s2.elt2 to i32
666  %mul2 = mul nuw i32 %conv4, %conv3
667
  ; Accumulate with commuted operands: accumulator + lane-1 product, then
  ; lane-0 product + that partial sum.
668  %s3 = load i32, i32 addrspace(1)* %dst, align 4
669  %add = add i32 %s3, %mul2
670  %add6 = add i32 %mul1, %add
671  store i32 %add6, i32 addrspace(1)* %dst, align 4
672  ret void
673}
674
675define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
676; GFX7-LABEL: idot2_MixedExt:
677; GFX7:       ; %bb.0: ; %entry
678; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
679; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
680; GFX7-NEXT:    s_mov_b32 s3, 0xf000
681; GFX7-NEXT:    s_mov_b32 s2, -1
682; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
683; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
684; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
685; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
686; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
687; GFX7-NEXT:    s_sext_i32_i16 s7, s4
688; GFX7-NEXT:    s_ashr_i32 s4, s4, 16
689; GFX7-NEXT:    s_and_b32 s8, s5, 0xffff
690; GFX7-NEXT:    s_ashr_i32 s5, s5, 16
691; GFX7-NEXT:    v_mov_b32_e32 v0, s4
692; GFX7-NEXT:    v_mov_b32_e32 v1, s6
693; GFX7-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
694; GFX7-NEXT:    v_mov_b32_e32 v1, s7
695; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
696; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
697; GFX7-NEXT:    s_endpgm
698;
699; GFX8-LABEL: idot2_MixedExt:
700; GFX8:       ; %bb.0: ; %entry
701; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
702; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
703; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
704; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
705; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
706; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
707; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
708; GFX8-NEXT:    s_sext_i32_i16 s5, s2
709; GFX8-NEXT:    s_ashr_i32 s2, s2, 16
710; GFX8-NEXT:    s_and_b32 s6, s3, 0xffff
711; GFX8-NEXT:    s_ashr_i32 s3, s3, 16
712; GFX8-NEXT:    v_mov_b32_e32 v0, s4
713; GFX8-NEXT:    v_mov_b32_e32 v1, s2
714; GFX8-NEXT:    v_mov_b32_e32 v2, s5
715; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
716; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
717; GFX8-NEXT:    v_mov_b32_e32 v0, s0
718; GFX8-NEXT:    v_mov_b32_e32 v1, s1
719; GFX8-NEXT:    flat_store_dword v[0:1], v2
720; GFX8-NEXT:    s_endpgm
721;
722; GFX9-NODL-LABEL: idot2_MixedExt:
723; GFX9-NODL:       ; %bb.0: ; %entry
724; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
725; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
726; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
727; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
728; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
729; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
730; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
731; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
732; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
733; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
734; GFX9-NODL-NEXT:    s_and_b32 s6, s3, 0xffff
735; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
736; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
737; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
738; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
739; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
740; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
741; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
742; GFX9-NODL-NEXT:    s_endpgm
743;
744; GFX9-DL-LABEL: idot2_MixedExt:
745; GFX9-DL:       ; %bb.0: ; %entry
746; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
747; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
748; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
749; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
750; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
751; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
752; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
753; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
754; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
755; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
756; GFX9-DL-NEXT:    s_and_b32 s6, s3, 0xffff
757; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
758; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
759; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
760; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
761; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
762; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
763; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
764; GFX9-DL-NEXT:    s_endpgm
765;
766; GFX10-DL-LABEL: idot2_MixedExt:
767; GFX10-DL:       ; %bb.0: ; %entry
768; GFX10-DL-NEXT:    s_clause 0x1
769; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
770; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
771; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
772; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
773; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
774; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
775; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
776; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
777; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
778; GFX10-DL-NEXT:    s_ashr_i32 s2, s0, 16
779; GFX10-DL-NEXT:    s_ashr_i32 s3, s1, 16
780; GFX10-DL-NEXT:    s_sext_i32_i16 s0, s0
781; GFX10-DL-NEXT:    s_and_b32 s1, s1, 0xffff
782; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
783; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s1, s0, v0
784; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
785; GFX10-DL-NEXT:    s_endpgm
786                                          <2 x i16> addrspace(1)* %src2,
787                                          i32 addrspace(1)* nocapture %dst) {
788entry:
789  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
790  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
791
792  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
793  %conv = sext i16 %s1.elt1 to i32
794  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
795  %conv2 = zext i16 %s2.elt1 to i32
796  %mul1 = mul nuw i32 %conv2, %conv
797
798  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
799  %conv3 = sext i16 %s1.elt2 to i32
800  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
801  %conv4 = sext i16 %s2.elt2 to i32
802  %mul2 = mul nuw i32 %conv4, %conv3
803
804  %s3 = load i32, i32 addrspace(1)* %dst, align 4
805  %add = add i32 %mul2, %s3
806  %add6 = add i32 %add, %mul1
807  store i32 %add6, i32 addrspace(1)* %dst, align 4
808  ret void
809}
810
; Negative test: each product multiplies a vector element by itself (element 0
; of %vec1 squared, element 1 of %vec2 squared) rather than pairing an element
; of %vec1 with the matching element of %vec2. Every check prefix below,
; including GFX9-DL and GFX10-DL, expects two v_mad_u32_u24 and no
; v_dot2_u32_u16.
811define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
812; GFX7-LABEL: notudot2_SameVec:
813; GFX7:       ; %bb.0: ; %entry
814; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
815; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
816; GFX7-NEXT:    s_mov_b32 s3, 0xf000
817; GFX7-NEXT:    s_mov_b32 s2, -1
818; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
820; GFX7-NEXT:    s_load_dword s7, s[0:1], 0x0
821; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
822; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX7-NEXT:    s_lshr_b32 s5, s6, 16
824; GFX7-NEXT:    v_mov_b32_e32 v0, s7
825; GFX7-NEXT:    s_and_b32 s4, s4, 0xffff
826; GFX7-NEXT:    v_mad_u32_u24 v0, s5, s5, v0
827; GFX7-NEXT:    v_mad_u32_u24 v0, s4, s4, v0
828; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
829; GFX7-NEXT:    s_endpgm
830;
831; GFX8-LABEL: notudot2_SameVec:
832; GFX8:       ; %bb.0: ; %entry
833; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
834; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
835; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
836; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
837; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
838; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
839; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
840; GFX8-NEXT:    s_lshr_b32 s2, s2, 16
841; GFX8-NEXT:    v_mov_b32_e32 v0, s3
842; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
843; GFX8-NEXT:    v_mad_u32_u24 v0, s2, s2, v0
844; GFX8-NEXT:    v_mad_u32_u24 v2, s4, s4, v0
845; GFX8-NEXT:    v_mov_b32_e32 v0, s0
846; GFX8-NEXT:    v_mov_b32_e32 v1, s1
847; GFX8-NEXT:    flat_store_dword v[0:1], v2
848; GFX8-NEXT:    s_endpgm
849;
850; GFX9-NODL-LABEL: notudot2_SameVec:
851; GFX9-NODL:       ; %bb.0: ; %entry
852; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
853; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
854; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
855; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
856; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
857; GFX9-NODL-NEXT:    s_load_dword s3, s[0:1], 0x0
858; GFX9-NODL-NEXT:    s_load_dword s4, s[4:5], 0x0
859; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
860; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
861; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
862; GFX9-NODL-NEXT:    s_and_b32 s4, s4, 0xffff
863; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, s2, v1
864; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, s4, v1
865; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
866; GFX9-NODL-NEXT:    s_endpgm
867;
868; GFX9-DL-LABEL: notudot2_SameVec:
869; GFX9-DL:       ; %bb.0: ; %entry
870; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
871; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
872; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
873; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
874; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
875; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
876; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
877; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
878; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 16
879; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
880; GFX9-DL-NEXT:    s_and_b32 s4, s4, 0xffff
881; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, s2, v1
882; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, s4, v1
883; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
884; GFX9-DL-NEXT:    s_endpgm
885;
886; GFX10-DL-LABEL: notudot2_SameVec:
887; GFX10-DL:       ; %bb.0: ; %entry
888; GFX10-DL-NEXT:    s_clause 0x1
889; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
890; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
891; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
892; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
893; GFX10-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
894; GFX10-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
895; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
896; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
897; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 16
898; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s2, s3
899; GFX10-DL-NEXT:    s_and_b32 s2, s4, 0xffff
900; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s2, v0
901; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
902; GFX10-DL-NEXT:    s_endpgm
903                                            <2 x i16> addrspace(1)* %src2,
904                                            i32 addrspace(1)* nocapture %dst) {
905entry:
906  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
907  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
908
  ; First product: both operands are element 0 of %vec1 (note that %s2.elt1
  ; is extracted from %vec1, not %vec2).
909  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
910  %conv = zext i16 %s1.elt1 to i32
911  %s2.elt1 = extractelement <2 x i16> %vec1, i64 0
912  %conv2 = zext i16 %s2.elt1 to i32
913  %mul1 = mul i32 %conv2, %conv
914
  ; Second product: both operands are element 1 of %vec2 (note that %s1.elt2
  ; is extracted from %vec2, not %vec1).
915  %s1.elt2 = extractelement <2 x i16> %vec2, i64 1
916  %conv3 = zext i16 %s1.elt2 to i32
917  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
918  %conv4 = zext i16 %s2.elt2 to i32
919  %mul2 = mul i32 %conv4, %conv3
920
  ; Accumulate both products onto the i32 loaded from %dst and store the sum
  ; back to %dst.
921  %s3 = load i32, i32 addrspace(1)* %dst, align 4
922  %add = add i32 %mul2, %s3
923  %add6 = add i32 %add, %mul1
924  store i32 %add6, i32 addrspace(1)* %dst, align 4
925  ret void
926}
927
; Positive test on <4 x i16> inputs: the zero-extended elements 0 and 1 of
; %vec1 and %vec2 are multiplied pairwise and accumulated onto the i32 loaded
; from %dst. The GFX9-DL and GFX10-DL checks expect a single v_dot2_u32_u16;
; the GFX7, GFX8 and GFX9-NODL checks expect two v_mad_u32_u24.
928define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
929; GFX7-LABEL: udot2_v4i16:
930; GFX7:       ; %bb.0: ; %entry
931; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
932; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
933; GFX7-NEXT:    s_mov_b32 s8, 0xffff
934; GFX7-NEXT:    s_mov_b32 s3, 0xf000
935; GFX7-NEXT:    s_mov_b32 s2, -1
936; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
937; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
938; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
939; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
940; GFX7-NEXT:    s_and_b32 s6, s4, s8
941; GFX7-NEXT:    s_and_b32 s7, s5, s8
942; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
943; GFX7-NEXT:    s_lshr_b32 s4, s4, 16
944; GFX7-NEXT:    s_lshr_b32 s5, s5, 16
945; GFX7-NEXT:    v_mov_b32_e32 v0, s4
946; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
947; GFX7-NEXT:    v_mov_b32_e32 v1, s8
948; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
949; GFX7-NEXT:    v_mov_b32_e32 v1, s6
950; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
951; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
952; GFX7-NEXT:    s_endpgm
953;
954; GFX8-LABEL: udot2_v4i16:
955; GFX8:       ; %bb.0: ; %entry
956; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
957; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
958; GFX8-NEXT:    s_mov_b32 s2, 0xffff
959; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
960; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
961; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
962; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
963; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
964; GFX8-NEXT:    s_and_b32 s6, s3, s2
965; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
966; GFX8-NEXT:    s_and_b32 s2, s4, s2
967; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
968; GFX8-NEXT:    v_mov_b32_e32 v0, s5
969; GFX8-NEXT:    v_mov_b32_e32 v1, s3
970; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
971; GFX8-NEXT:    v_mov_b32_e32 v1, s6
972; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
973; GFX8-NEXT:    v_mov_b32_e32 v0, s0
974; GFX8-NEXT:    v_mov_b32_e32 v1, s1
975; GFX8-NEXT:    flat_store_dword v[0:1], v2
976; GFX8-NEXT:    s_endpgm
977;
978; GFX9-NODL-LABEL: udot2_v4i16:
979; GFX9-NODL:       ; %bb.0: ; %entry
980; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
981; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
982; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
983; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
984; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
985; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
986; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
987; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
988; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
989; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
990; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
991; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
992; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
993; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
994; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
995; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
996; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
997; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
998; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
999; GFX9-NODL-NEXT:    s_endpgm
1000;
1001; GFX9-DL-LABEL: udot2_v4i16:
1002; GFX9-DL:       ; %bb.0: ; %entry
1003; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1004; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1005; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1006; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1007; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
1008; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
1009; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
1010; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1011; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
1012; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
1013; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, s4, v1, v2
1014; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1015; GFX9-DL-NEXT:    s_endpgm
1016;
1017; GFX10-DL-LABEL: udot2_v4i16:
1018; GFX10-DL:       ; %bb.0: ; %entry
1019; GFX10-DL-NEXT:    s_clause 0x1
1020; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1021; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1022; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1023; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1024; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
1025; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1026; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1027; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1028; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
1029; GFX10-DL-NEXT:    v_dot2_u32_u16 v0, s1, s0, v0
1030; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
1031; GFX10-DL-NEXT:    s_endpgm
1032                                       <4 x i16> addrspace(1)* %src2,
1033                                       i32 addrspace(1)* nocapture %dst) {
1034entry:
1035  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
1036  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
1037
  ; First product: element 0 of %vec1 times element 0 of %vec2.
1038  %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1039  %conv = zext i16 %s1.elt1 to i32
1040  %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1041  %conv2 = zext i16 %s2.elt1 to i32
1042  %mul1 = mul i32 %conv2, %conv
1043
  ; Second product: element 1 of %vec1 times element 1 of %vec2.
1044  %s1.elt2 = extractelement <4 x i16> %vec1, i64 1
1045  %conv3 = zext i16 %s1.elt2 to i32
1046  %s2.elt2 = extractelement <4 x i16> %vec2, i64 1
1047  %conv4 = zext i16 %s2.elt2 to i32
1048  %mul2 = mul i32 %conv4, %conv3
1049
  ; Accumulate both products onto the i32 loaded from %dst and store the sum
  ; back to %dst.
1050  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1051  %add = add i32 %mul2, %s3
1052  %add6 = add i32 %add, %mul1
1053  store i32 %add6, i32 addrspace(1)* %dst, align 4
1054  ret void
1055}
1056
; Positive test on the upper half of <4 x i16> inputs: the zero-extended
; elements 2 and 3 of %vec1 and %vec2 are multiplied pairwise and accumulated
; onto the i32 loaded from %dst. The GFX9-DL and GFX10-DL checks expect a
; single v_dot2_u32_u16; the other prefixes expect two v_mad_u32_u24. In all
; check blocks the two source dwords are loaded with a non-zero offset
; operand (0x1 in the GFX7 checks, 0x4 in the others).
1057define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
1058; GFX7-LABEL: udot2_v4i16_Hi:
1059; GFX7:       ; %bb.0: ; %entry
1060; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1061; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1062; GFX7-NEXT:    s_mov_b32 s8, 0xffff
1063; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1064; GFX7-NEXT:    s_mov_b32 s2, -1
1065; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1066; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x1
1067; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
1068; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1069; GFX7-NEXT:    s_and_b32 s6, s4, s8
1070; GFX7-NEXT:    s_and_b32 s7, s5, s8
1071; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
1072; GFX7-NEXT:    s_lshr_b32 s4, s4, 16
1073; GFX7-NEXT:    s_lshr_b32 s5, s5, 16
1074; GFX7-NEXT:    v_mov_b32_e32 v0, s4
1075; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1076; GFX7-NEXT:    v_mov_b32_e32 v1, s8
1077; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
1078; GFX7-NEXT:    v_mov_b32_e32 v1, s6
1079; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
1080; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1081; GFX7-NEXT:    s_endpgm
1082;
1083; GFX8-LABEL: udot2_v4i16_Hi:
1084; GFX8:       ; %bb.0: ; %entry
1085; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1086; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1087; GFX8-NEXT:    s_mov_b32 s2, 0xffff
1088; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1089; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x4
1090; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x4
1091; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
1092; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1093; GFX8-NEXT:    s_and_b32 s6, s3, s2
1094; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
1095; GFX8-NEXT:    s_and_b32 s2, s4, s2
1096; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
1097; GFX8-NEXT:    v_mov_b32_e32 v0, s5
1098; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1099; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
1100; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1101; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
1102; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1103; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1104; GFX8-NEXT:    flat_store_dword v[0:1], v2
1105; GFX8-NEXT:    s_endpgm
1106;
1107; GFX9-NODL-LABEL: udot2_v4i16_Hi:
1108; GFX9-NODL:       ; %bb.0: ; %entry
1109; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1110; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1111; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
1112; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1113; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1114; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x4
1115; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x4
1116; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
1117; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1118; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
1119; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
1120; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
1121; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
1122; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
1123; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
1124; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
1125; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
1126; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
1127; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
1128; GFX9-NODL-NEXT:    s_endpgm
1129;
1130; GFX9-DL-LABEL: udot2_v4i16_Hi:
1131; GFX9-DL:       ; %bb.0: ; %entry
1132; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1133; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1134; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1135; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1136; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x4
1137; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
1138; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x4
1139; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1140; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
1141; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
1142; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, s4, v1, v2
1143; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1144; GFX9-DL-NEXT:    s_endpgm
1145;
1146; GFX10-DL-LABEL: udot2_v4i16_Hi:
1147; GFX10-DL:       ; %bb.0: ; %entry
1148; GFX10-DL-NEXT:    s_clause 0x1
1149; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1150; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1151; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1152; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1153; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
1154; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x4
1155; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x4
1156; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1157; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
1158; GFX10-DL-NEXT:    v_dot2_u32_u16 v0, s1, s0, v0
1159; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
1160; GFX10-DL-NEXT:    s_endpgm
1161                                          <4 x i16> addrspace(1)* %src2,
1162                                          i32 addrspace(1)* nocapture %dst) {
1163entry:
1164  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
1165  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
1166
  ; First product: element 2 of %vec1 times element 2 of %vec2.
1167  %s1.elt1 = extractelement <4 x i16> %vec1, i64 2
1168  %conv = zext i16 %s1.elt1 to i32
1169  %s2.elt1 = extractelement <4 x i16> %vec2, i64 2
1170  %conv2 = zext i16 %s2.elt1 to i32
1171  %mul1 = mul i32 %conv2, %conv
1172
  ; Second product: element 3 of %vec1 times element 3 of %vec2.
1173  %s1.elt2 = extractelement <4 x i16> %vec1, i64 3
1174  %conv3 = zext i16 %s1.elt2 to i32
1175  %s2.elt2 = extractelement <4 x i16> %vec2, i64 3
1176  %conv4 = zext i16 %s2.elt2 to i32
1177  %mul2 = mul i32 %conv4, %conv3
1178
  ; Accumulate both products onto the i32 loaded from %dst and store the sum
  ; back to %dst.
1179  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1180  %add = add i32 %mul2, %s3
1181  %add6 = add i32 %add, %mul1
1182  store i32 %add6, i32 addrspace(1)* %dst, align 4
1183  ret void
1184}
1185
; Negative test: the two products use elements 0 and 2 of the <4 x i16>
; inputs, i.e. two elements that are not adjacent in the vector. Every check
; prefix below, including GFX9-DL and GFX10-DL, expects two v_mad_u32_u24 and
; no v_dot2_u32_u16.
1186define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
1187; GFX7-LABEL: notudot2_v4i16_Even:
1188; GFX7:       ; %bb.0: ; %entry
1189; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1190; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1191; GFX7-NEXT:    s_mov_b32 s8, 0xffff
1192; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1193; GFX7-NEXT:    s_mov_b32 s2, -1
1194; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1195; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1196; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
1197; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1198; GFX7-NEXT:    s_and_b32 s5, s5, s8
1199; GFX7-NEXT:    s_and_b32 s4, s4, s8
1200; GFX7-NEXT:    s_and_b32 s6, s6, s8
1201; GFX7-NEXT:    s_and_b32 s7, s7, s8
1202; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
1203; GFX7-NEXT:    v_mov_b32_e32 v0, s5
1204; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1205; GFX7-NEXT:    v_mov_b32_e32 v1, s8
1206; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
1207; GFX7-NEXT:    v_mov_b32_e32 v1, s4
1208; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
1209; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1210; GFX7-NEXT:    s_endpgm
1211;
1212; GFX8-LABEL: notudot2_v4i16_Even:
1213; GFX8:       ; %bb.0: ; %entry
1214; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1215; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1216; GFX8-NEXT:    s_mov_b32 s8, 0xffff
1217; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1218; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1219; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
1220; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x0
1221; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1222; GFX8-NEXT:    s_and_b32 s3, s3, s8
1223; GFX8-NEXT:    s_and_b32 s2, s2, s8
1224; GFX8-NEXT:    s_and_b32 s5, s5, s8
1225; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1226; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1227; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
1228; GFX8-NEXT:    s_and_b32 s4, s4, s8
1229; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1230; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
1231; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1232; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1233; GFX8-NEXT:    flat_store_dword v[0:1], v2
1234; GFX8-NEXT:    s_endpgm
1235;
1236; GFX9-NODL-LABEL: notudot2_v4i16_Even:
1237; GFX9-NODL:       ; %bb.0: ; %entry
1238; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1239; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1240; GFX9-NODL-NEXT:    s_mov_b32 s8, 0xffff
1241; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1242; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1243; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1244; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
1245; GFX9-NODL-NEXT:    s_load_dword s6, s[0:1], 0x0
1246; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1247; GFX9-NODL-NEXT:    s_and_b32 s3, s3, s8
1248; GFX9-NODL-NEXT:    s_and_b32 s2, s2, s8
1249; GFX9-NODL-NEXT:    s_and_b32 s5, s5, s8
1250; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
1251; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
1252; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
1253; GFX9-NODL-NEXT:    s_and_b32 s4, s4, s8
1254; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
1255; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
1256; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
1257; GFX9-NODL-NEXT:    s_endpgm
1258;
1259; GFX9-DL-LABEL: notudot2_v4i16_Even:
1260; GFX9-DL:       ; %bb.0: ; %entry
1261; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1262; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1263; GFX9-DL-NEXT:    s_mov_b32 s8, 0xffff
1264; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1265; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1266; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1267; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
1268; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
1269; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1270; GFX9-DL-NEXT:    s_and_b32 s3, s3, s8
1271; GFX9-DL-NEXT:    s_and_b32 s2, s2, s8
1272; GFX9-DL-NEXT:    s_and_b32 s5, s5, s8
1273; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
1274; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
1275; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
1276; GFX9-DL-NEXT:    s_and_b32 s4, s4, s8
1277; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
1278; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
1279; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1280; GFX9-DL-NEXT:    s_endpgm
1281;
1282; GFX10-DL-LABEL: notudot2_v4i16_Even:
1283; GFX10-DL:       ; %bb.0: ; %entry
1284; GFX10-DL-NEXT:    s_clause 0x1
1285; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1286; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1287; GFX10-DL-NEXT:    s_mov_b32 s7, 0xffff
1288; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1289; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1290; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
1291; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1292; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1293; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1294; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
1295; GFX10-DL-NEXT:    s_and_b32 s1, s1, s7
1296; GFX10-DL-NEXT:    s_and_b32 s3, s3, s7
1297; GFX10-DL-NEXT:    s_and_b32 s0, s0, s7
1298; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s1, v0
1299; GFX10-DL-NEXT:    s_and_b32 s1, s2, s7
1300; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s1, s0, v0
1301; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
1302; GFX10-DL-NEXT:    s_endpgm
1303                                               <4 x i16> addrspace(1)* %src2,
1304                                               i32 addrspace(1)* nocapture %dst) {
1305entry:
1306  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
1307  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
1308
  ; First product: element 0 of %vec1 times element 0 of %vec2.
1309  %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1310  %conv = zext i16 %s1.elt1 to i32
1311  %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1312  %conv2 = zext i16 %s2.elt1 to i32
1313  %mul1 = mul i32 %conv2, %conv
1314
  ; Second product: element 2 (not element 1) of %vec1 times element 2 of
  ; %vec2.
1315  %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1316  %conv3 = zext i16 %s1.elt2 to i32
1317  %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1318  %conv4 = zext i16 %s2.elt2 to i32
1319  %mul2 = mul i32 %conv4, %conv3
1320
  ; Accumulate both products onto the i32 loaded from %dst and store the sum
  ; back to %dst.
1321  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1322  %add = add i32 %mul2, %s3
1323  %add6 = add i32 %add, %mul1
1324  store i32 %add6, i32 addrspace(1)* %dst, align 4
1325  ret void
1326}
1327
; Negative test: the two products use elements 1 and 2 of the <4 x i16>
; inputs. In the checks these come from different loaded dwords: element 1 is
; obtained with s_lshr_b32 ..., 16 on the first register of the dwordx2 load
; and element 2 with an s_and_b32 0xffff mask on the second. Every check
; prefix below, including GFX9-DL and GFX10-DL, expects two v_mad_u32_u24 and
; no v_dot2_u32_u16.
1328define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
1329; GFX7-LABEL: notudot2_v4i16_Middle:
1330; GFX7:       ; %bb.0: ; %entry
1331; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1332; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1333; GFX7-NEXT:    s_mov_b32 s8, 0xffff
1334; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1335; GFX7-NEXT:    s_mov_b32 s2, -1
1336; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1337; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1338; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
1339; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1340; GFX7-NEXT:    s_and_b32 s5, s5, s8
1341; GFX7-NEXT:    s_and_b32 s7, s7, s8
1342; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
1343; GFX7-NEXT:    s_lshr_b32 s4, s4, 16
1344; GFX7-NEXT:    v_mov_b32_e32 v0, s5
1345; GFX7-NEXT:    s_lshr_b32 s6, s6, 16
1346; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1347; GFX7-NEXT:    v_mov_b32_e32 v1, s8
1348; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
1349; GFX7-NEXT:    v_mov_b32_e32 v1, s4
1350; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
1351; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1352; GFX7-NEXT:    s_endpgm
1353;
1354; GFX8-LABEL: notudot2_v4i16_Middle:
1355; GFX8:       ; %bb.0: ; %entry
1356; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1357; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1358; GFX8-NEXT:    s_mov_b32 s8, 0xffff
1359; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1360; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1361; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
1362; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x0
1363; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1364; GFX8-NEXT:    s_and_b32 s3, s3, s8
1365; GFX8-NEXT:    s_lshr_b32 s2, s2, 16
1366; GFX8-NEXT:    s_and_b32 s5, s5, s8
1367; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1368; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1369; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
1370; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
1371; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1372; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
1373; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1374; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1375; GFX8-NEXT:    flat_store_dword v[0:1], v2
1376; GFX8-NEXT:    s_endpgm
1377;
1378; GFX9-NODL-LABEL: notudot2_v4i16_Middle:
1379; GFX9-NODL:       ; %bb.0: ; %entry
1380; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1381; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1382; GFX9-NODL-NEXT:    s_mov_b32 s8, 0xffff
1383; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1384; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1385; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1386; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
1387; GFX9-NODL-NEXT:    s_load_dword s6, s[0:1], 0x0
1388; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1389; GFX9-NODL-NEXT:    s_and_b32 s3, s3, s8
1390; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
1391; GFX9-NODL-NEXT:    s_and_b32 s5, s5, s8
1392; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
1393; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
1394; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
1395; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
1396; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
1397; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
1398; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
1399; GFX9-NODL-NEXT:    s_endpgm
1400;
1401; GFX9-DL-LABEL: notudot2_v4i16_Middle:
1402; GFX9-DL:       ; %bb.0: ; %entry
1403; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1404; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1405; GFX9-DL-NEXT:    s_mov_b32 s8, 0xffff
1406; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1407; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1408; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1409; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
1410; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
1411; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1412; GFX9-DL-NEXT:    s_and_b32 s3, s3, s8
1413; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 16
1414; GFX9-DL-NEXT:    s_and_b32 s5, s5, s8
1415; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
1416; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
1417; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
1418; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 16
1419; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
1420; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
1421; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1422; GFX9-DL-NEXT:    s_endpgm
1423;
1424; GFX10-DL-LABEL: notudot2_v4i16_Middle:
1425; GFX10-DL:       ; %bb.0: ; %entry
1426; GFX10-DL-NEXT:    s_clause 0x1
1427; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1428; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1429; GFX10-DL-NEXT:    s_mov_b32 s7, 0xffff
1430; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1431; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1432; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
1433; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1434; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1435; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1436; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
1437; GFX10-DL-NEXT:    s_and_b32 s1, s1, s7
1438; GFX10-DL-NEXT:    s_and_b32 s3, s3, s7
1439; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 16
1440; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s1, v0
1441; GFX10-DL-NEXT:    s_lshr_b32 s1, s2, 16
1442; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s1, s0, v0
1443; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
1444; GFX10-DL-NEXT:    s_endpgm
1445                                                 <4 x i16> addrspace(1)* %src2,
1446                                                 i32 addrspace(1)* nocapture %dst) {
1447entry:
1448  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
1449  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
1450
  ; First product: element 1 of %vec1 times element 1 of %vec2.
1451  %s1.elt1 = extractelement <4 x i16> %vec1, i64 1
1452  %conv = zext i16 %s1.elt1 to i32
1453  %s2.elt1 = extractelement <4 x i16> %vec2, i64 1
1454  %conv2 = zext i16 %s2.elt1 to i32
1455  %mul1 = mul i32 %conv2, %conv
1456
  ; Second product: element 2 of %vec1 times element 2 of %vec2.
1457  %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1458  %conv3 = zext i16 %s1.elt2 to i32
1459  %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1460  %conv4 = zext i16 %s2.elt2 to i32
1461  %mul2 = mul i32 %conv4, %conv3
1462
  ; Accumulate both products onto the i32 loaded from %dst and store the sum
  ; back to %dst.
1463  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1464  %add = add i32 %mul2, %s3
1465  %add6 = add i32 %add, %mul1
1466  store i32 %add6, i32 addrspace(1)* %dst, align 4
1467  ret void
1468}
1469
; Negative test for the unsigned 16-bit dot2 pattern: the two products pair
; mismatched lanes (%vec1[0] * %vec2[1] and %vec1[1] * %vec2[0]) before being
; accumulated into the i32 loaded from %dst. Every check block below expects
; two v_mad_u32_u24 instructions and no dot2 instruction.
1470define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
1471; GFX7-LABEL: notudot2_DiffIndex:
1472; GFX7:       ; %bb.0: ; %entry
1473; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1474; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1475; GFX7-NEXT:    s_mov_b32 s8, 0xffff
1476; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1477; GFX7-NEXT:    s_mov_b32 s2, -1
1478; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1479; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1480; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1481; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1482; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
1483; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
1484; GFX7-NEXT:    s_and_b32 s4, s4, s8
1485; GFX7-NEXT:    s_and_b32 s5, s5, s8
1486; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
1487; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1488; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1489; GFX7-NEXT:    v_mov_b32_e32 v1, s8
1490; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
1491; GFX7-NEXT:    v_mov_b32_e32 v1, s4
1492; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
1493; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1494; GFX7-NEXT:    s_endpgm
1495;
1496; GFX8-LABEL: notudot2_DiffIndex:
1497; GFX8:       ; %bb.0: ; %entry
1498; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1499; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1500; GFX8-NEXT:    s_mov_b32 s2, 0xffff
1501; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1502; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
1503; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
1504; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
1505; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1506; GFX8-NEXT:    s_and_b32 s6, s3, s2
1507; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
1508; GFX8-NEXT:    s_and_b32 s2, s4, s2
1509; GFX8-NEXT:    v_mov_b32_e32 v0, s5
1510; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1511; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
1512; GFX8-NEXT:    s_lshr_b32 s7, s4, 16
1513; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1514; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v1, v0
1515; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1516; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1517; GFX8-NEXT:    flat_store_dword v[0:1], v2
1518; GFX8-NEXT:    s_endpgm
1519;
1520; GFX9-NODL-LABEL: notudot2_DiffIndex:
1521; GFX9-NODL:       ; %bb.0: ; %entry
1522; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1523; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1524; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
1525; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1526; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1527; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
1528; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
1529; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
1530; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1531; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
1532; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
1533; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
1534; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
1535; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
1536; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
1537; GFX9-NODL-NEXT:    s_lshr_b32 s7, s4, 16
1538; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
1539; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
1540; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
1541; GFX9-NODL-NEXT:    s_endpgm
1542;
1543; GFX9-DL-LABEL: notudot2_DiffIndex:
1544; GFX9-DL:       ; %bb.0: ; %entry
1545; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1546; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1547; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
1548; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1549; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1550; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
1551; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
1552; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
1553; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1554; GFX9-DL-NEXT:    s_and_b32 s6, s3, s2
1555; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
1556; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
1557; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
1558; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
1559; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
1560; GFX9-DL-NEXT:    s_lshr_b32 s7, s4, 16
1561; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
1562; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
1563; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1564; GFX9-DL-NEXT:    s_endpgm
1565;
1566; GFX10-DL-LABEL: notudot2_DiffIndex:
1567; GFX10-DL:       ; %bb.0: ; %entry
1568; GFX10-DL-NEXT:    s_clause 0x1
1569; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1570; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1571; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1572; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1573; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
1574; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1575; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1576; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
1577; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1578; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
1579; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 16
1580; GFX10-DL-NEXT:    s_and_b32 s6, s1, s2
1581; GFX10-DL-NEXT:    s_and_b32 s0, s0, s2
1582; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 16
1583; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s3, v0
1584; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s1, s0, v0
1585; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
1586; GFX10-DL-NEXT:    s_endpgm
1587                                              <2 x i16> addrspace(1)* %src2,
1588                                              i32 addrspace(1)* nocapture %dst) {
1589entry:
1590  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1591  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1592
; First product: element 0 of %vec1 times element 1 of %vec2, both
; zero-extended to i32. The lane indices deliberately differ.
1593  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1594  %conv = zext i16 %s1.elt1 to i32
1595  %s2.elt1 = extractelement <2 x i16> %vec2, i64 1
1596  %conv2 = zext i16 %s2.elt1 to i32
1597  %mul1 = mul i32 %conv2, %conv
1598
; Second product: element 1 of %vec1 times element 0 of %vec2.
1599  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1600  %conv3 = zext i16 %s1.elt2 to i32
1601  %s2.elt2 = extractelement <2 x i16> %vec2, i64 0
1602  %conv4 = zext i16 %s2.elt2 to i32
1603  %mul2 = mul i32 %conv4, %conv3
1604
; Accumulate both products into the i32 currently stored at %dst and write
; the sum back to the same location.
1605  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1606  %add = add i32 %mul2, %s3
1607  %add6 = add i32 %add, %mul1
1608  store i32 %add6, i32 addrspace(1)* %dst, align 4
1609  ret void
1610}
1611
; Unsigned 16-bit dot2-shaped computation in which the intermediate sum %add1
; (= %mul2 + %s3) has a second use: it is added once more to the final result.
; Every check block below expects two v_mad_u32_u24 instructions followed by a
; separate vector add, and no dot2 instruction.
1612define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
1613; GFX7-LABEL: udot2_MultipleUses_add1:
1614; GFX7:       ; %bb.0: ; %entry
1615; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1616; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1617; GFX7-NEXT:    s_mov_b32 s8, 0xffff
1618; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1619; GFX7-NEXT:    s_mov_b32 s2, -1
1620; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1621; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1622; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1623; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1624; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
1625; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
1626; GFX7-NEXT:    s_and_b32 s4, s4, s8
1627; GFX7-NEXT:    s_and_b32 s5, s5, s8
1628; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
1629; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1630; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1631; GFX7-NEXT:    v_mov_b32_e32 v1, s8
1632; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
1633; GFX7-NEXT:    v_mov_b32_e32 v1, s4
1634; GFX7-NEXT:    v_mad_u32_u24 v1, s5, v1, v0
1635; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1636; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1637; GFX7-NEXT:    s_endpgm
1638;
1639; GFX8-LABEL: udot2_MultipleUses_add1:
1640; GFX8:       ; %bb.0: ; %entry
1641; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1642; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1643; GFX8-NEXT:    s_mov_b32 s2, 0xffff
1644; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1645; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
1646; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
1647; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
1648; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1649; GFX8-NEXT:    s_and_b32 s6, s3, s2
1650; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
1651; GFX8-NEXT:    s_and_b32 s2, s4, s2
1652; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
1653; GFX8-NEXT:    v_mov_b32_e32 v0, s5
1654; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1655; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
1656; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1657; GFX8-NEXT:    v_mad_u32_u24 v1, s2, v1, v0
1658; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
1659; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1660; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1661; GFX8-NEXT:    flat_store_dword v[0:1], v2
1662; GFX8-NEXT:    s_endpgm
1663;
1664; GFX9-NODL-LABEL: udot2_MultipleUses_add1:
1665; GFX9-NODL:       ; %bb.0: ; %entry
1666; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1667; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1668; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
1669; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1670; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1671; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
1672; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
1673; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
1674; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1675; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
1676; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
1677; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
1678; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
1679; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
1680; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
1681; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
1682; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
1683; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v2, v1
1684; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v2, v1
1685; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
1686; GFX9-NODL-NEXT:    s_endpgm
1687;
1688; GFX9-DL-LABEL: udot2_MultipleUses_add1:
1689; GFX9-DL:       ; %bb.0: ; %entry
1690; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1691; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1692; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
1693; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1694; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1695; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
1696; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
1697; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
1698; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1699; GFX9-DL-NEXT:    s_and_b32 s6, s3, s2
1700; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
1701; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
1702; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 16
1703; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
1704; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
1705; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
1706; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
1707; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v2, v1
1708; GFX9-DL-NEXT:    v_add_u32_e32 v1, v2, v1
1709; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1710; GFX9-DL-NEXT:    s_endpgm
1711;
1712; GFX10-DL-LABEL: udot2_MultipleUses_add1:
1713; GFX10-DL:       ; %bb.0: ; %entry
1714; GFX10-DL-NEXT:    s_clause 0x1
1715; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1716; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1717; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1718; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1719; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
1720; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1721; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1722; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1723; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
1724; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 16
1725; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 16
1726; GFX10-DL-NEXT:    s_mov_b32 s6, 0xffff
1727; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
1728; GFX10-DL-NEXT:    s_and_b32 s0, s0, s6
1729; GFX10-DL-NEXT:    s_and_b32 s1, s1, s6
1730; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s1, s0, v0
1731; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
1732; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
1733; GFX10-DL-NEXT:    s_endpgm
1734                                                   <2 x i16> addrspace(1)* %src2,
1735                                                   i32 addrspace(1)* nocapture %dst) {
1736entry:
1737  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1738  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1739
; Product of the element-0 lanes, zero-extended to i32.
1740  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1741  %conv = zext i16 %s1.elt1 to i32
1742  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1743  %conv2 = zext i16 %s2.elt1 to i32
1744  %mul1 = mul i32 %conv2, %conv
1745
; Product of the element-1 lanes, zero-extended to i32.
1746  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1747  %conv3 = zext i16 %s1.elt2 to i32
1748  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1749  %conv4 = zext i16 %s2.elt2 to i32
1750  %mul2 = mul i32 %conv4, %conv3
1751
; %add1 is the inner accumulation; it feeds both %add2 and %res below.
1752  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1753  %add1 = add i32 %mul2, %s3
1754  %add2 = add i32 %add1, %mul1
1755
; Second use of %add1: the stored value is %add2 + %add1.
1756  %res = add i32 %add2, %add1
1757  store i32 %res, i32 addrspace(1)* %dst, align 4
1758  ret void
1759}
1760
; Signed 16-bit dot2-shaped computation in which the intermediate sum %add1
; (= %mul2 + %s3) has a second use: it is added once more to the final result.
; Every check block below expects two v_mad_i32_i24 instructions followed by a
; separate vector add, and no dot2 instruction.
1761define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
1762; GFX7-LABEL: idot2_MultipleUses_add1:
1763; GFX7:       ; %bb.0: ; %entry
1764; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1765; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1766; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1767; GFX7-NEXT:    s_mov_b32 s2, -1
1768; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1769; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1770; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1771; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
1772; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1773; GFX7-NEXT:    s_sext_i32_i16 s7, s4
1774; GFX7-NEXT:    s_ashr_i32 s4, s4, 16
1775; GFX7-NEXT:    s_sext_i32_i16 s8, s5
1776; GFX7-NEXT:    s_ashr_i32 s5, s5, 16
1777; GFX7-NEXT:    v_mov_b32_e32 v0, s4
1778; GFX7-NEXT:    v_mov_b32_e32 v1, s6
1779; GFX7-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
1780; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1781; GFX7-NEXT:    v_mad_i32_i24 v1, s8, v1, v0
1782; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1783; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1784; GFX7-NEXT:    s_endpgm
1785;
1786; GFX8-LABEL: idot2_MultipleUses_add1:
1787; GFX8:       ; %bb.0: ; %entry
1788; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1789; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1790; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1791; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
1792; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
1793; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
1794; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1795; GFX8-NEXT:    s_sext_i32_i16 s5, s2
1796; GFX8-NEXT:    s_ashr_i32 s2, s2, 16
1797; GFX8-NEXT:    s_sext_i32_i16 s6, s3
1798; GFX8-NEXT:    s_ashr_i32 s3, s3, 16
1799; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1800; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1801; GFX8-NEXT:    v_mov_b32_e32 v2, s5
1802; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
1803; GFX8-NEXT:    v_mad_i32_i24 v1, s6, v2, v0
1804; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
1805; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1806; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1807; GFX8-NEXT:    flat_store_dword v[0:1], v2
1808; GFX8-NEXT:    s_endpgm
1809;
1810; GFX9-NODL-LABEL: idot2_MultipleUses_add1:
1811; GFX9-NODL:       ; %bb.0: ; %entry
1812; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1813; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1814; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1815; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1816; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
1817; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
1818; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
1819; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1820; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
1821; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
1822; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
1823; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
1824; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
1825; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
1826; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
1827; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
1828; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v2, v1
1829; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v2, v1
1830; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
1831; GFX9-NODL-NEXT:    s_endpgm
1832;
1833; GFX9-DL-LABEL: idot2_MultipleUses_add1:
1834; GFX9-DL:       ; %bb.0: ; %entry
1835; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1836; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1837; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1838; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1839; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
1840; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
1841; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
1842; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1843; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
1844; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
1845; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
1846; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
1847; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
1848; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
1849; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
1850; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
1851; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v2, v1
1852; GFX9-DL-NEXT:    v_add_u32_e32 v1, v2, v1
1853; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1854; GFX9-DL-NEXT:    s_endpgm
1855;
1856; GFX10-DL-LABEL: idot2_MultipleUses_add1:
1857; GFX10-DL:       ; %bb.0: ; %entry
1858; GFX10-DL-NEXT:    s_clause 0x1
1859; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1860; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1861; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1862; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1863; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
1864; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1865; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1866; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1867; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
1868; GFX10-DL-NEXT:    s_ashr_i32 s2, s0, 16
1869; GFX10-DL-NEXT:    s_ashr_i32 s3, s1, 16
1870; GFX10-DL-NEXT:    s_sext_i32_i16 s0, s0
1871; GFX10-DL-NEXT:    s_sext_i32_i16 s1, s1
1872; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
1873; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s1, s0, v0
1874; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
1875; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
1876; GFX10-DL-NEXT:    s_endpgm
1877                                                   <2 x i16> addrspace(1)* %src2,
1878                                                   i32 addrspace(1)* nocapture %dst) {
1879entry:
1880  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1881  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1882
; Product of the element-0 lanes, sign-extended to i32.
1883  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1884  %conv = sext i16 %s1.elt1 to i32
1885  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1886  %conv2 = sext i16 %s2.elt1 to i32
1887  %mul1 = mul i32 %conv2, %conv
1888
; Product of the element-1 lanes, sign-extended to i32.
1889  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1890  %conv3 = sext i16 %s1.elt2 to i32
1891  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1892  %conv4 = sext i16 %s2.elt2 to i32
1893  %mul2 = mul i32 %conv4, %conv3
1894
; %add1 is the inner accumulation; it feeds both %add2 and %res below.
1895  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1896  %add1 = add i32 %mul2, %s3
1897  %add2 = add i32 %add1, %mul1
1898
; Second use of %add1: the stored value is %add2 + %add1.
1899  %res = add i32 %add2, %add1
1900  store i32 %res, i32 addrspace(1)* %dst, align 4
1901  ret void
1902}
1903
; Unsigned 16-bit dot2-shaped computation in which the element-0 product %mul1
; is used twice: once in %add0 (with the value loaded from %dst) and once more
; in the final %add2. Every check block below expects three v_mad_u32_u24
; instructions and no dot2 instruction.
1904define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
1905; GFX7-LABEL: udot2_MultipleUses_mul1:
1906; GFX7:       ; %bb.0: ; %entry
1907; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1908; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1909; GFX7-NEXT:    s_mov_b32 s8, 0xffff
1910; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1911; GFX7-NEXT:    s_mov_b32 s2, -1
1912; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1913; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
1914; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
1915; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1916; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
1917; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
1918; GFX7-NEXT:    s_and_b32 s4, s4, s8
1919; GFX7-NEXT:    s_and_b32 s5, s5, s8
1920; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
1921; GFX7-NEXT:    v_mov_b32_e32 v0, s4
1922; GFX7-NEXT:    v_mov_b32_e32 v2, s6
1923; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1924; GFX7-NEXT:    v_mov_b32_e32 v1, s8
1925; GFX7-NEXT:    v_mad_u32_u24 v1, s5, v0, v1
1926; GFX7-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
1927; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
1928; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1929; GFX7-NEXT:    s_endpgm
1930;
1931; GFX8-LABEL: udot2_MultipleUses_mul1:
1932; GFX8:       ; %bb.0: ; %entry
1933; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1934; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1935; GFX8-NEXT:    s_mov_b32 s2, 0xffff
1936; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1937; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
1938; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
1939; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
1940; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1941; GFX8-NEXT:    s_and_b32 s6, s3, s2
1942; GFX8-NEXT:    s_and_b32 s2, s4, s2
1943; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
1944; GFX8-NEXT:    v_mov_b32_e32 v0, s5
1945; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1946; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
1947; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
1948; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1949; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v2, v0
1950; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
1951; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1952; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1953; GFX8-NEXT:    flat_store_dword v[0:1], v2
1954; GFX8-NEXT:    s_endpgm
1955;
1956; GFX9-NODL-LABEL: udot2_MultipleUses_mul1:
1957; GFX9-NODL:       ; %bb.0: ; %entry
1958; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1959; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1960; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
1961; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1962; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1963; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
1964; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
1965; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
1966; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1967; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
1968; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
1969; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
1970; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
1971; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
1972; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
1973; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v1, v2
1974; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
1975; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
1976; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
1977; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
1978; GFX9-NODL-NEXT:    s_endpgm
1979;
1980; GFX9-DL-LABEL: udot2_MultipleUses_mul1:
1981; GFX9-DL:       ; %bb.0: ; %entry
1982; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1983; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1984; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
1985; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1986; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1987; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
1988; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
1989; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
1990; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1991; GFX9-DL-NEXT:    s_and_b32 s6, s3, s2
1992; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
1993; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
1994; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
1995; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
1996; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 16
1997; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v1, v2
1998; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
1999; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
2000; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
2001; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
2002; GFX9-DL-NEXT:    s_endpgm
2003;
2004; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
2005; GFX10-DL:       ; %bb.0: ; %entry
2006; GFX10-DL-NEXT:    s_clause 0x1
2007; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2008; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2009; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
2010; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2011; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
2012; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2013; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2014; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
2015; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2016; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
2017; GFX10-DL-NEXT:    s_and_b32 s3, s0, s2
2018; GFX10-DL-NEXT:    s_and_b32 s2, s1, s2
2019; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 16
2020; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 16
2021; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
2022; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s1, s0, v0
2023; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
2024; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
2025; GFX10-DL-NEXT:    s_endpgm
2026                                                   <2 x i16> addrspace(1)* %src2,
2027                                                   i32 addrspace(1)* nocapture %dst) {
2028entry:
2029  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
2030  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
2031
; Product of the element-0 lanes, zero-extended to i32. %mul1 is the value
; with two uses in this test.
2032  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2033  %conv = zext i16 %s1.elt1 to i32
2034  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2035  %conv2 = zext i16 %s2.elt1 to i32
2036  %mul1 = mul i32 %conv2, %conv
2037
; Product of the element-1 lanes, zero-extended to i32.
2038  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2039  %conv3 = zext i16 %s1.elt2 to i32
2040  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2041  %conv4 = zext i16 %s2.elt2 to i32
2042  %mul2 = mul i32 %conv4, %conv3
2043
; First use of %mul1: added to the value loaded from %dst.
2044  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2045  %add0 = add i32 %mul1, %s3
2046
; Second use of %mul1: added again after %mul2 has been accumulated.
2047  %add1 = add i32 %mul2, %add0
2048  %add2 = add i32 %add1, %mul1
2049
2050  store i32 %add2, i32 addrspace(1)* %dst, align 4
2051  ret void
2052}
2053
; Signed 16-bit dot2-shaped computation in which the element-0 product %mul1
; is used twice: once in %add0 (with the value loaded from %dst) and once more
; in the final %add2. Every check block below expects three v_mad_i32_i24
; instructions and no dot2 instruction.
2054define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
2055; GFX7-LABEL: idot2_MultipleUses_mul1:
2056; GFX7:       ; %bb.0: ; %entry
2057; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2058; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2059; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2060; GFX7-NEXT:    s_mov_b32 s2, -1
2061; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2062; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
2063; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
2064; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
2065; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2066; GFX7-NEXT:    s_sext_i32_i16 s7, s4
2067; GFX7-NEXT:    s_sext_i32_i16 s8, s5
2068; GFX7-NEXT:    s_ashr_i32 s4, s4, 16
2069; GFX7-NEXT:    v_mov_b32_e32 v0, s7
2070; GFX7-NEXT:    v_mov_b32_e32 v1, s6
2071; GFX7-NEXT:    s_ashr_i32 s5, s5, 16
2072; GFX7-NEXT:    v_mad_i32_i24 v1, s8, v0, v1
2073; GFX7-NEXT:    v_mov_b32_e32 v2, s4
2074; GFX7-NEXT:    v_mad_i32_i24 v1, s5, v2, v1
2075; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v0, v1
2076; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2077; GFX7-NEXT:    s_endpgm
2078;
2079; GFX8-LABEL: idot2_MultipleUses_mul1:
2080; GFX8:       ; %bb.0: ; %entry
2081; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2082; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2083; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2084; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
2085; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
2086; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
2087; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2088; GFX8-NEXT:    s_sext_i32_i16 s5, s2
2089; GFX8-NEXT:    s_sext_i32_i16 s6, s3
2090; GFX8-NEXT:    s_ashr_i32 s2, s2, 16
2091; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2092; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2093; GFX8-NEXT:    s_ashr_i32 s3, s3, 16
2094; GFX8-NEXT:    v_mov_b32_e32 v2, s2
2095; GFX8-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
2096; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v2, v0
2097; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v1, v0
2098; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2099; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2100; GFX8-NEXT:    flat_store_dword v[0:1], v2
2101; GFX8-NEXT:    s_endpgm
2102;
2103; GFX9-NODL-LABEL: idot2_MultipleUses_mul1:
2104; GFX9-NODL:       ; %bb.0: ; %entry
2105; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2106; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2107; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2108; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2109; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
2110; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
2111; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
2112; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2113; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
2114; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
2115; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
2116; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
2117; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
2118; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
2119; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
2120; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
2121; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v3, v1
2122; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
2123; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
2124; GFX9-NODL-NEXT:    s_endpgm
2125;
2126; GFX9-DL-LABEL: idot2_MultipleUses_mul1:
2127; GFX9-DL:       ; %bb.0: ; %entry
2128; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2129; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2130; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2131; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2132; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
2133; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
2134; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
2135; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2136; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
2137; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
2138; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
2139; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
2140; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
2141; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
2142; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
2143; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
2144; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v3, v1
2145; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
2146; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
2147; GFX9-DL-NEXT:    s_endpgm
2148;
2149; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
2150; GFX10-DL:       ; %bb.0: ; %entry
2151; GFX10-DL-NEXT:    s_clause 0x1
2152; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2153; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2154; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
2155; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2156; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
2157; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2158; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2159; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2160; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
2161; GFX10-DL-NEXT:    s_sext_i32_i16 s2, s0
2162; GFX10-DL-NEXT:    s_sext_i32_i16 s3, s1
2163; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 16
2164; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 16
2165; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
2166; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s1, s0, v0
2167; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
2168; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
2169; GFX10-DL-NEXT:    s_endpgm
2170                                                   <2 x i16> addrspace(1)* %src2,
2171                                                   i32 addrspace(1)* nocapture %dst) {
2172entry:
2173  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
2174  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
2175
; Product of the element-0 lanes, sign-extended to i32. %mul1 is the value
; with two uses in this test.
2176  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2177  %conv = sext i16 %s1.elt1 to i32
2178  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2179  %conv2 = sext i16 %s2.elt1 to i32
2180  %mul1 = mul i32 %conv2, %conv
2181
; Product of the element-1 lanes, sign-extended to i32.
2182  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2183  %conv3 = sext i16 %s1.elt2 to i32
2184  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2185  %conv4 = sext i16 %s2.elt2 to i32
2186  %mul2 = mul i32 %conv4, %conv3
2187
; First use of %mul1: added to the value loaded from %dst.
2188  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2189  %add0 = add i32 %mul1, %s3
2190
; Second use of %mul1: added again after %mul2 has been accumulated.
2191  %add1 = add i32 %mul2, %add0
2192  %add2 = add i32 %add1, %mul1
2193
2194  store i32 %add2, i32 addrspace(1)* %dst, align 4
2195  ret void
2196}
2197
; Unsigned 2 x i16 multiply-accumulate in which the lane-1 product (%mul2) has
; two uses (%add0 and %add1) and the lane-0 product (%mul1) has one (%add2).
; The autogenerated assertions below expect three v_mad_u32_u24 instructions on
; every checked target, including the GFX9-DL and GFX10-DL runs; none of them
; expects v_dot2_u32_u16. Regenerate the assertions with
; utils/update_llc_test_checks.py rather than editing them by hand.
2198define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
2199; GFX7-LABEL: udot2_MultipleUses_mul2:
2200; GFX7:       ; %bb.0: ; %entry
2201; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2202; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2203; GFX7-NEXT:    s_mov_b32 s8, 0xffff
2204; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2205; GFX7-NEXT:    s_mov_b32 s2, -1
2206; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2207; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
2208; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
2209; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2210; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
2211; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
2212; GFX7-NEXT:    s_and_b32 s4, s4, s8
2213; GFX7-NEXT:    s_and_b32 s5, s5, s8
2214; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
2215; GFX7-NEXT:    v_mov_b32_e32 v0, s6
2216; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2217; GFX7-NEXT:    v_mov_b32_e32 v1, s8
2218; GFX7-NEXT:    v_mad_u32_u24 v1, s7, v0, v1
2219; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
2220; GFX7-NEXT:    v_mov_b32_e32 v1, s4
2221; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
2222; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2223; GFX7-NEXT:    s_endpgm
2224;
2225; GFX8-LABEL: udot2_MultipleUses_mul2:
2226; GFX8:       ; %bb.0: ; %entry
2227; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2228; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2229; GFX8-NEXT:    s_mov_b32 s2, 0xffff
2230; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2231; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
2232; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
2233; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
2234; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2235; GFX8-NEXT:    s_and_b32 s6, s3, s2
2236; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
2237; GFX8-NEXT:    s_and_b32 s2, s4, s2
2238; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
2239; GFX8-NEXT:    v_mov_b32_e32 v0, s5
2240; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2241; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
2242; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
2243; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2244; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
2245; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2246; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2247; GFX8-NEXT:    flat_store_dword v[0:1], v2
2248; GFX8-NEXT:    s_endpgm
2249;
2250; GFX9-NODL-LABEL: udot2_MultipleUses_mul2:
2251; GFX9-NODL:       ; %bb.0: ; %entry
2252; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2253; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2254; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
2255; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2256; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2257; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
2258; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
2259; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
2260; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2261; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
2262; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
2263; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
2264; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
2265; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
2266; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
2267; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, v1, v2
2268; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
2269; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
2270; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
2271; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
2272; GFX9-NODL-NEXT:    s_endpgm
2273;
2274; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
2275; GFX9-DL:       ; %bb.0: ; %entry
2276; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2277; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2278; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
2279; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2280; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2281; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
2282; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
2283; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
2284; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2285; GFX9-DL-NEXT:    s_and_b32 s6, s3, s2
2286; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
2287; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
2288; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 16
2289; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
2290; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
2291; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v1, v2
2292; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
2293; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
2294; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
2295; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
2296; GFX9-DL-NEXT:    s_endpgm
2297;
2298; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
2299; GFX10-DL:       ; %bb.0: ; %entry
2300; GFX10-DL-NEXT:    s_clause 0x1
2301; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2302; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2303; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
2304; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2305; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
2306; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2307; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2308; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2309; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
2310; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 16
2311; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 16
2312; GFX10-DL-NEXT:    s_mov_b32 s6, 0xffff
2313; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
2314; GFX10-DL-NEXT:    s_and_b32 s0, s0, s6
2315; GFX10-DL-NEXT:    s_and_b32 s1, s1, s6
2316; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
2317; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s1, s0, v0
2318; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
2319; GFX10-DL-NEXT:    s_endpgm
2320                                                   <2 x i16> addrspace(1)* %src2,
2321                                                   i32 addrspace(1)* nocapture %dst) {
2322entry:
2323  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
2324  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
2325
; Lane 0 of each vector, zero-extended to i32, then multiplied (%mul1).
2326  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2327  %conv = zext i16 %s1.elt1 to i32
2328  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2329  %conv2 = zext i16 %s2.elt1 to i32
2330  %mul1 = mul i32 %conv2, %conv
2331
; Lane 1 of each vector, zero-extended to i32, then multiplied (%mul2).
2332  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2333  %conv3 = zext i16 %s1.elt2 to i32
2334  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2335  %conv4 = zext i16 %s2.elt2 to i32
2336  %mul2 = mul i32 %conv4, %conv3
2337
; The accumulator is loaded from %dst; %mul2 is added to it here ...
2338  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2339  %add0 = add i32 %mul2, %s3
2340
; ... and a second time here, before %mul1 is added once.
2341  %add1 = add i32 %mul2, %add0
2342  %add2 = add i32 %add1, %mul1
2343
2344  store i32 %add2, i32 addrspace(1)* %dst, align 4
2345  ret void
2346}
2347
; Signed 2 x i16 multiply-accumulate in which the lane-1 product (%mul2) has
; two uses (%add0 and %add1) and the lane-0 product (%mul1) has one (%add2).
; The autogenerated assertions below expect three v_mad_i32_i24 instructions on
; every checked target, including the GFX9-DL and GFX10-DL runs; none of them
; expects v_dot2_i32_i16. Regenerate the assertions with
; utils/update_llc_test_checks.py rather than editing them by hand.
2348define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
2349; GFX7-LABEL: idot2_MultipleUses_mul2:
2350; GFX7:       ; %bb.0: ; %entry
2351; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2352; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2353; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2354; GFX7-NEXT:    s_mov_b32 s2, -1
2355; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2356; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
2357; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
2358; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
2359; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2360; GFX7-NEXT:    s_sext_i32_i16 s7, s4
2361; GFX7-NEXT:    s_ashr_i32 s4, s4, 16
2362; GFX7-NEXT:    s_sext_i32_i16 s8, s5
2363; GFX7-NEXT:    s_ashr_i32 s5, s5, 16
2364; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2365; GFX7-NEXT:    v_mov_b32_e32 v1, s6
2366; GFX7-NEXT:    v_mad_i32_i24 v1, s5, v0, v1
2367; GFX7-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
2368; GFX7-NEXT:    v_mov_b32_e32 v1, s7
2369; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
2370; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2371; GFX7-NEXT:    s_endpgm
2372;
2373; GFX8-LABEL: idot2_MultipleUses_mul2:
2374; GFX8:       ; %bb.0: ; %entry
2375; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2376; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2377; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2378; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
2379; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
2380; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
2381; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2382; GFX8-NEXT:    s_sext_i32_i16 s5, s2
2383; GFX8-NEXT:    s_ashr_i32 s2, s2, 16
2384; GFX8-NEXT:    s_sext_i32_i16 s6, s3
2385; GFX8-NEXT:    s_ashr_i32 s3, s3, 16
2386; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2387; GFX8-NEXT:    v_mov_b32_e32 v1, s2
2388; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
2389; GFX8-NEXT:    v_mov_b32_e32 v2, s5
2390; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
2391; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
2392; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2393; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2394; GFX8-NEXT:    flat_store_dword v[0:1], v2
2395; GFX8-NEXT:    s_endpgm
2396;
2397; GFX9-NODL-LABEL: idot2_MultipleUses_mul2:
2398; GFX9-NODL:       ; %bb.0: ; %entry
2399; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2400; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2401; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2402; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2403; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
2404; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
2405; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
2406; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2407; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
2408; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
2409; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
2410; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
2411; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
2412; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
2413; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
2414; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
2415; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
2416; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
2417; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
2418; GFX9-NODL-NEXT:    s_endpgm
2419;
2420; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
2421; GFX9-DL:       ; %bb.0: ; %entry
2422; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2423; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2424; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2425; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2426; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
2427; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
2428; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
2429; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2430; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
2431; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
2432; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
2433; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
2434; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
2435; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
2436; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
2437; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
2438; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
2439; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
2440; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
2441; GFX9-DL-NEXT:    s_endpgm
2442;
2443; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
2444; GFX10-DL:       ; %bb.0: ; %entry
2445; GFX10-DL-NEXT:    s_clause 0x1
2446; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2447; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2448; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
2449; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2450; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
2451; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2452; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2453; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2454; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
2455; GFX10-DL-NEXT:    s_ashr_i32 s2, s0, 16
2456; GFX10-DL-NEXT:    s_ashr_i32 s3, s1, 16
2457; GFX10-DL-NEXT:    s_sext_i32_i16 s0, s0
2458; GFX10-DL-NEXT:    s_sext_i32_i16 s1, s1
2459; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
2460; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
2461; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s1, s0, v0
2462; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
2463; GFX10-DL-NEXT:    s_endpgm
2464                                                   <2 x i16> addrspace(1)* %src2,
2465                                                   i32 addrspace(1)* nocapture %dst) {
2466entry:
2467  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
2468  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
2469
; Lane 0 of each vector, sign-extended to i32, then multiplied (%mul1).
2470  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2471  %conv = sext i16 %s1.elt1 to i32
2472  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2473  %conv2 = sext i16 %s2.elt1 to i32
2474  %mul1 = mul i32 %conv2, %conv
2475
; Lane 1 of each vector, sign-extended to i32, then multiplied (%mul2).
2476  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2477  %conv3 = sext i16 %s1.elt2 to i32
2478  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2479  %conv4 = sext i16 %s2.elt2 to i32
2480  %mul2 = mul i32 %conv4, %conv3
2481
; The accumulator is loaded from %dst; %mul2 is added to it here ...
2482  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2483  %add0 = add i32 %mul2, %s3
2484
; ... and a second time here, before %mul1 is added once.
2485  %add1 = add i32 %mul2, %add0
2486  %add2 = add i32 %add1, %mul1
2487
2488  store i32 %add2, i32 addrspace(1)* %dst, align 4
2489  ret void
2490}
2491
; 2 x i16 multiply-accumulate carried out entirely in i16: the lane products
; are i16 multiplies with no explicit extension, and the accumulator is an i16
; loaded from and stored back to %dst.
; The autogenerated assertions below expect v_dot2_u32_u16 followed by a
; 16-bit store in the GFX9-DL and GFX10-DL runs, and two v_mad_u32_u24 in the
; GFX7, GFX8 and GFX9-NODL runs. Regenerate the assertions with
; utils/update_llc_test_checks.py rather than editing them by hand.
2492define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
2493; GFX7-LABEL: udot2_acc16:
2494; GFX7:       ; %bb.0: ; %entry
2495; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2496; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2497; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2498; GFX7-NEXT:    s_mov_b32 s2, -1
2499; GFX7-NEXT:    s_mov_b32 s8, 0xffff
2500; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2501; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
2502; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
2503; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
2504; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2505; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
2506; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
2507; GFX7-NEXT:    v_mov_b32_e32 v1, s7
2508; GFX7-NEXT:    s_and_b32 s5, s5, s8
2509; GFX7-NEXT:    s_and_b32 s4, s4, s8
2510; GFX7-NEXT:    s_waitcnt vmcnt(0)
2511; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
2512; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2513; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
2514; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
2515; GFX7-NEXT:    s_endpgm
2516;
2517; GFX8-LABEL: udot2_acc16:
2518; GFX8:       ; %bb.0: ; %entry
2519; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2520; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2521; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2522; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2523; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2524; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
2525; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
2526; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
2527; GFX8-NEXT:    s_mov_b32 s0, 0xffff
2528; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2529; GFX8-NEXT:    s_and_b32 s3, s2, s0
2530; GFX8-NEXT:    s_lshr_b32 s2, s2, 16
2531; GFX8-NEXT:    s_and_b32 s0, s1, s0
2532; GFX8-NEXT:    s_lshr_b32 s1, s1, 16
2533; GFX8-NEXT:    v_mov_b32_e32 v3, s2
2534; GFX8-NEXT:    s_waitcnt vmcnt(0)
2535; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
2536; GFX8-NEXT:    v_mov_b32_e32 v3, s3
2537; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
2538; GFX8-NEXT:    flat_store_short v[0:1], v2
2539; GFX8-NEXT:    s_endpgm
2540;
2541; GFX9-NODL-LABEL: udot2_acc16:
2542; GFX9-NODL:       ; %bb.0: ; %entry
2543; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2544; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2545; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2546; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
2547; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2548; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[0:1]
2549; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
2550; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
2551; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2552; GFX9-NODL-NEXT:    s_and_b32 s5, s4, s2
2553; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
2554; GFX9-NODL-NEXT:    s_and_b32 s2, s3, s2
2555; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
2556; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
2557; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2558; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
2559; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
2560; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
2561; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
2562; GFX9-NODL-NEXT:    s_endpgm
2563;
2564; GFX9-DL-LABEL: udot2_acc16:
2565; GFX9-DL:       ; %bb.0: ; %entry
2566; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2567; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2568; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2569; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2570; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
2571; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
2572; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
2573; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2574; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
2575; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2576; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, s2, v2, v1
2577; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
2578; GFX9-DL-NEXT:    s_endpgm
2579;
2580; GFX10-DL-LABEL: udot2_acc16:
2581; GFX10-DL:       ; %bb.0: ; %entry
2582; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2583; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2584; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2585; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2586; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
2587; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2588; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2589; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2590; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, s0, s1, v1
2591; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
2592; GFX10-DL-NEXT:    s_endpgm
2593                                       <2 x i16> addrspace(1)* %src2,
2594                                       i16 addrspace(1)* nocapture %dst) {
2595entry:
2596  %v1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
2597  %v2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
2598
; Lane-0 product, computed directly in i16.
2599  %v1e1 = extractelement <2 x i16> %v1, i64 0
2600  %v2e1 = extractelement <2 x i16> %v2, i64 0
2601  %mul1 = mul i16 %v1e1, %v2e1
2602
; Lane-1 product, computed directly in i16.
2603  %v1e2 = extractelement <2 x i16> %v1, i64 1
2604  %v2e2 = extractelement <2 x i16> %v2, i64 1
2605  %mul2 = mul i16 %v1e2, %v2e2
2606
; i16 accumulator: each product is used exactly once in the add chain.
2607  %s2 = load i16, i16 addrspace(1)* %dst, align 2
2608  %add1 = add i16 %mul2, %s2
2609  %add2 = add i16 %add1, %mul1
2610  store i16 %add2, i16 addrspace(1)* %dst, align 2
2611  ret void
2612}
2613
; Two-lane multiply-accumulate whose inputs are <2 x i8> vectors; each lane is
; sign-extended from i8 to i32 before the multiply, and the accumulator is an
; i32 loaded from %dst.
; The autogenerated assertions below expect v_bfe_i32 extractions followed by
; two v_mad_i32_i24 on every checked target, including the GFX9-DL and
; GFX10-DL runs; none of them expects a v_dot2 instruction. Regenerate the
; assertions with utils/update_llc_test_checks.py rather than editing them by
; hand.
2614define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
2615; GFX7-LABEL: notsdot2_sext8:
2616; GFX7:       ; %bb.0: ; %entry
2617; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2618; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
2619; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2620; GFX7-NEXT:    s_mov_b32 s2, -1
2621; GFX7-NEXT:    s_mov_b32 s10, s2
2622; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2623; GFX7-NEXT:    s_mov_b32 s0, s4
2624; GFX7-NEXT:    s_mov_b32 s1, s5
2625; GFX7-NEXT:    s_mov_b32 s4, s6
2626; GFX7-NEXT:    s_mov_b32 s5, s7
2627; GFX7-NEXT:    s_mov_b32 s6, s2
2628; GFX7-NEXT:    s_mov_b32 s7, s3
2629; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
2630; GFX7-NEXT:    buffer_load_ushort v1, off, s[4:7], 0
2631; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
2632; GFX7-NEXT:    s_mov_b32 s11, s3
2633; GFX7-NEXT:    s_waitcnt vmcnt(1)
2634; GFX7-NEXT:    v_bfe_i32 v2, v0, 0, 8
2635; GFX7-NEXT:    s_waitcnt vmcnt(0)
2636; GFX7-NEXT:    v_bfe_i32 v3, v1, 0, 8
2637; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
2638; GFX7-NEXT:    v_bfe_i32 v1, v1, 8, 8
2639; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2640; GFX7-NEXT:    v_mad_i32_i24 v0, v1, v0, s0
2641; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v2, v0
2642; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
2643; GFX7-NEXT:    s_endpgm
2644;
2645; GFX8-LABEL: notsdot2_sext8:
2646; GFX8:       ; %bb.0: ; %entry
2647; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2648; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2649; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2650; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2651; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2652; GFX8-NEXT:    v_mov_b32_e32 v2, s6
2653; GFX8-NEXT:    v_mov_b32_e32 v3, s7
2654; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
2655; GFX8-NEXT:    flat_load_ushort v1, v[2:3]
2656; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
2657; GFX8-NEXT:    s_waitcnt vmcnt(1)
2658; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
2659; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
2660; GFX8-NEXT:    s_waitcnt vmcnt(0)
2661; GFX8-NEXT:    v_bfe_i32 v3, v1, 0, 8
2662; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
2663; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
2664; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 8
2665; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2666; GFX8-NEXT:    v_mad_i32_i24 v0, v1, v0, s2
2667; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v2, v0
2668; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2669; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2670; GFX8-NEXT:    flat_store_dword v[0:1], v2
2671; GFX8-NEXT:    s_endpgm
2672;
2673; GFX9-NODL-LABEL: notsdot2_sext8:
2674; GFX9-NODL:       ; %bb.0: ; %entry
2675; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2676; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2677; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2678; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2679; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[4:5]
2680; GFX9-NODL-NEXT:    global_load_ushort v2, v0, s[6:7]
2681; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
2682; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2683; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
2684; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
2685; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2686; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
2687; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
2688; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
2689; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
2690; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2691; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v2, v1, s2
2692; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v4, v3, v1
2693; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
2694; GFX9-NODL-NEXT:    s_endpgm
2695;
2696; GFX9-DL-LABEL: notsdot2_sext8:
2697; GFX9-DL:       ; %bb.0: ; %entry
2698; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2699; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2700; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2701; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2702; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
2703; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
2704; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2705; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2706; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 8
2707; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
2708; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2709; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 8
2710; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
2711; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
2712; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
2713; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2714; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, s2
2715; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v4, v3, v1
2716; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
2717; GFX9-DL-NEXT:    s_endpgm
2718;
2719; GFX10-DL-LABEL: notsdot2_sext8:
2720; GFX10-DL:       ; %bb.0: ; %entry
2721; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2722; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2723; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2724; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2725; GFX10-DL-NEXT:    s_clause 0x1
2726; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
2727; GFX10-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
2728; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2729; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2730; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, v1
2731; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2732; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, v2
2733; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
2734; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
2735; GFX10-DL-NEXT:    v_bfe_i32 v3, v3, 0, 8
2736; GFX10-DL-NEXT:    v_bfe_i32 v4, v4, 0, 8
2737; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2738; GFX10-DL-NEXT:    v_mad_i32_i24 v3, v4, v3, s2
2739; GFX10-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, v3
2740; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
2741; GFX10-DL-NEXT:    s_endpgm
2742                                          <2 x i8> addrspace(1)* %src2,
2743                                          i32 addrspace(1)* nocapture %dst) {
2744entry:
2745  %vec1 = load <2 x i8>, <2 x i8> addrspace(1)* %src1
2746  %vec2 = load <2 x i8>, <2 x i8> addrspace(1)* %src2
2747
; Lane 0 of each vector, sign-extended from i8 to i32, then multiplied.
; NOTE(review): the multiply carries nuw although its operands are
; sign-extended; left as-is because the assertions above were generated from
; exactly this IR.
2748  %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
2749  %conv = sext i8 %s1.elt1 to i32
2750  %s2.elt1 = extractelement <2 x i8> %vec2, i64 0
2751  %conv2 = sext i8 %s2.elt1 to i32
2752  %mul1 = mul nuw i32 %conv2, %conv
2753
; Lane 1 of each vector, sign-extended from i8 to i32, then multiplied.
2754  %s1.elt2 = extractelement <2 x i8> %vec1, i64 1
2755  %conv3 = sext i8 %s1.elt2 to i32
2756  %s2.elt2 = extractelement <2 x i8> %vec2, i64 1
2757  %conv4 = sext i8 %s2.elt2 to i32
2758  %mul2 = mul nuw i32 %conv4, %conv3
2759
; i32 accumulator: each product is used exactly once in the add chain.
2760  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2761  %add = add i32 %mul2, %s3
2762  %add6 = add i32 %add, %mul1
2763  store i32 %add6, i32 addrspace(1)* %dst, align 4
2764  ret void
2765}
2766