; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; FIXME: Merge into imm.ll

; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x i16> splat of i16 -32768 to %out; the packed value is expected as the 32-bit literal 0x80008000.
define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
  store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of 0.0 to %out; the source operand is expected to be the immediate 0.
define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of -0.0 to %out; expected as the 32-bit literal 0x80008000.
define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of 0.5 to %out; expected as the 32-bit literal 0x38003800.
define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of -0.5 to %out; expected as the 32-bit literal 0xb800b800.
define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of 1.0 to %out; expected as the 32-bit literal 0x3c003c00.
define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of -1.0 to %out; expected as the 32-bit literal 0xbc00bc00.
define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of 2.0 to %out; expected as the 32-bit literal 0x40004000.
define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of -2.0 to %out; expected as the 32-bit literal 0xc000c000.
define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of 4.0 to %out; expected as the 32-bit literal 0x44004400.
define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of -4.0 to %out; expected as the 32-bit literal 0xc400c400.
define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of the bit pattern 0xH3118 to %out; expected as the 32-bit literal 0x31183118.
define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118 ; encoding
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of the bit pattern 0xHB118 to %out; expected as the 32-bit literal 0xb118b118.
define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_literal_imm_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00
; GCN: buffer_store_dword [[REG]]
; Stores a <2 x half> splat of 4096.0 to %out; expected as the 32-bit literal 0x6c006c00.
define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_0.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0 ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of the kernel argument %x with a <2 x half> splat of 0.0; the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0.0, half 0.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_0.5_v2f16:
; GFX10: s_load_dword [[VAL:s[0-9]+]]
; GFX10: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x00,0x0f,0xcc,0x02,0xe0,0x01,0x08]
; GFX10: buffer_store_dword [[REG]]

; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x00,0x8f,0xd3,0x04,0xe0,0x01,0x08]
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of the kernel argument %x with a <2 x half> splat of 0.5; the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_v2f16:
; GFX10: s_load_dword [[VAL:s[0-9]+]]
; GFX10: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x00,0x0f,0xcc,0x02,0xe2,0x01,0x08]
; GFX10: buffer_store_dword [[REG]]

; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x00,0x8f,0xd3,0x04,0xe2,0x01,0x08]
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of the kernel argument %x with a <2 x half> splat of -0.5; the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -0.5, half -0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_1.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of the kernel argument %x with a <2 x half> splat of 1.0; the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 1.0, half 1.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]


; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of the kernel argument %x with a <2 x half> splat of -1.0; the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -1.0, half -1.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_2.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of the kernel argument %x with a <2 x half> splat of 2.0; the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 2.0, half 2.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of the kernel argument %x with a <2 x half> splat of -2.0; the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -2.0, half -2.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_4.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of the kernel argument %x with a <2 x half> splat of 4.0; the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 4.0, half 4.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of the kernel argument %x with a <2 x half> splat of -4.0; the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -4.0, half -4.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_v2f16:
; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
; GFX9: buffer_store_dword [[REG]]

; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: buffer_load_dword
; VI-NOT: and
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
; VI: v_or_b32
; VI: buffer_store_dword
; fadd with a <2 x half> splat of 0.5 where %x is loaded from %in rather than passed by value; the sum is stored to %out.
define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}commute_add_literal_v2f16:
; GFX10: v_pk_add_f16 v0, 0x6400, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x00,0x0f,0xcc,0xff,0x00,0x02,0x10,0x00,0x64,0x00,0x00]

; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0] ; encoding: [0x00,0x00,0x8f,0xd3,0x00,0x09,0x00,0x08]
; GFX9: buffer_store_dword [[REG]]

; VI-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding
; VI-DAG: buffer_load_dword
; VI-NOT: and
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
; gfx8 does not support an sreg or imm operand in sdwa, so the constant is expected to be moved into a VGPR first
; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: buffer_store_dword
; fadd of %x (loaded from %in) with a <2 x half> splat of 1024.0, which appears as 0x6400 in the expected output; the sum is stored to %out.
define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %y = fadd <2 x half> %x, <half 1024.0, half 1024.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_1_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1 ; encoding
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1 ; encoding
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of %x with a splat of half 0xH0001 (the 16-bit pattern of the integer 1); the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_2_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]


; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2 ; encoding
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2 ; encoding
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of %x with a splat of half 0xH0002 (the 16-bit pattern of the integer 2); the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_16_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]


; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16 ; encoding
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16 ; encoding
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of %x with a splat of half 0xH0010 (the 16-bit pattern of the integer 16); the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16:
; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, -1
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1 ; encoding
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
; Integer add (not an fadd): %x is bitcast to i32, -1 is added, and the result is bitcast back to <2 x half> before the store.
define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, -1
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16:
; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, 0xfffefffe
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe ; encoding
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
; Integer add (not an fadd): %x is bitcast to i32, 0xfffefffe is added, and the result is bitcast back to <2 x half> before the store.
define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, 4294901758 ; 0xfffefffe
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16:
; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, 0xfff0fff0
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
; GFX9: buffer_store_dword [[REG]]


; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0 ; encoding
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
; Integer add (not an fadd): %x is bitcast to i32, 0xfff0fff0 is added, and the result is bitcast back to <2 x half> before the store.
define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, 4293984240 ; 0xfff0fff0
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_63_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of %x with a splat of half 0xH003F (the 16-bit pattern of the integer 63); the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_64_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64
; VI: v_or_b32
; VI: buffer_store_dword
; fadd of %x with a splat of half 0xH0040 (the 16-bit pattern of the integer 64); the sum is stored to %out.
define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mul_inline_imm_0.5_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x38003800
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0x3800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x00]
; Integer multiply of %x by the bit pattern of a <2 x half> splat of 0.5 (0x3800 per lane in the expected output).
define <2 x i16> @mul_inline_imm_0.5_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0.5, half 0.5> to <2 x i16>)
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}mul_inline_imm_neg_0.5_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xb800b800
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0xb800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0x00,0x00]
; Integer multiply of %x by the bit pattern of a <2 x half> splat of -0.5 (0xb800 per lane in the expected output).
define <2 x i16> @mul_inline_imm_neg_0.5_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half -0.5, half -0.5> to <2 x i16>)
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}mul_inline_imm_1.0_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0x3c00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x00]
; Integer multiply of %x by the bit pattern of a <2 x half> splat of 1.0 (0x3c00 per lane in the expected output).
define <2 x i16> @mul_inline_imm_1.0_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half 1.0, half 1.0> to <2 x i16>)
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}mul_inline_imm_neg_1.0_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00bc00
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0xbc00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0x00,0x00]
; Integer multiply of %x by the bit pattern of a <2 x half> splat of -1.0 (0xbc00 per lane in the expected output).
define <2 x i16> @mul_inline_imm_neg_1.0_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half -1.0, half -1.0> to <2 x i16>)
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}shl_inline_imm_2.0_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40004000
; GFX9: v_pk_lshlrev_b16 v0, v0, [[K]]

; GFX10: v_pk_lshlrev_b16 v0, v0, 0x4000 op_sel_hi:[1,0] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x40,0x00,0x00]
; Shifts the bit pattern of a <2 x half> splat of 2.0 left by %x; the constant is the value being shifted, %x the shift amount.
define <2 x i16> @shl_inline_imm_2.0_v2i16(<2 x i16> %x) {
  %y = shl <2 x i16> bitcast (<2 x half> <half 2.0, half 2.0> to <2 x i16>), %x
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}shl_inline_imm_neg_2.0_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc000c000
; GFX9: v_pk_lshlrev_b16 v0, v0, [[K]]

; GFX10: v_pk_lshlrev_b16 v0, v0, 0xc000 op_sel_hi:[1,0] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc0,0x00,0x00]
; Shifts the bit pattern of a <2 x half> splat of -2.0 left by %x; the constant is the value being shifted, %x the shift amount.
define <2 x i16> @shl_inline_imm_neg_2.0_v2i16(<2 x i16> %x) {
  %y = shl <2 x i16> bitcast (<2 x half> <half -2.0, half -2.0> to <2 x i16>), %x
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}mul_inline_imm_4.0_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004400
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0x4400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x00]
; Integer multiply of %x by the bit pattern of a <2 x half> splat of 4.0 (0x4400 per lane in the expected output).
define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half 4.0, half 4.0> to <2 x i16>)
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}mul_inline_imm_neg_4.0_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc400c400
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0xc400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0x00,0x00]
; Integer multiply of %x by the bit pattern of a <2 x half> splat of -4.0 (0xc400 per lane in the expected output).
define <2 x i16> @mul_inline_imm_neg_4.0_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half -4.0, half -4.0> to <2 x i16>)
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}mul_inline_imm_inv2pi_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x31183118
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0x3118, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x00,0x00]
; Integer multiply of %x by the bit pattern of a <2 x half> splat of 0xH3118 (0x3118 per lane in the expected output).
define <2 x i16> @mul_inline_imm_inv2pi_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xH3118, half 0xH3118> to <2 x i16>)
  ret <2 x i16> %y
}

; Attribute group #0, referenced by the amdgpu_kernel definitions in this file.
attributes #0 = { nounwind }