; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; FIXME: Merge into imm.ll

; Codegen checks for packed 16-bit (<2 x half> / <2 x i16>) constants on
; gfx1010, gfx900, fiji and kaveri, as selected by the four llc invocations
; above. Each test stores or consumes a splat constant and checks the operand
; (and sometimes the MC encoding) that the instruction printer shows for it.
;
; Notes for maintainers:
;  * FileCheck is run with -enable-var-scope, so pattern variables such as
;    REG, VAL and K are expected to be local to their label block and are
;    deliberately reused from test to test.
;  * The kaveri run only participates in the common-prefix checks; no check
;    line in this file uses the kaveri-specific prefix on its own.
;  * Only a few tests carry gfx1010-specific checks.

; -----------------------------------------------------------------------------
; Stores of splat constants: the checks expect the full 32-bit pattern to be
; materialized with a single v_mov_b32 and then stored.
; -----------------------------------------------------------------------------

; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
  store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118 ; encoding
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_literal_imm_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out
  ret void
}

; -----------------------------------------------------------------------------
; fadd of a kernel argument with a splat floating-point constant.
; The gfx900 checks expect one v_pk_add_f16 with the constant as an operand;
; the fiji checks expect the two halves to be added separately (SDWA for the
; high half, with the constant first placed in a VGPR) and then OR'ed together.
; -----------------------------------------------------------------------------

; GCN-LABEL: {{^}}add_inline_imm_0.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0 ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0.0, half 0.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_0.5_v2f16:
; GFX10: s_load_dword [[VAL:s[0-9]+]]
; GFX10: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x00,0x0f,0xcc,0x02,0xe0,0x01,0x08]
; GFX10: buffer_store_dword [[REG]]

; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x00,0x8f,0xd3,0x04,0xe0,0x01,0x08]
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_v2f16:
; GFX10: s_load_dword [[VAL:s[0-9]+]]
; GFX10: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x00,0x0f,0xcc,0x02,0xe2,0x01,0x08]
; GFX10: buffer_store_dword [[REG]]

; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x00,0x8f,0xd3,0x04,0xe2,0x01,0x08]
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -0.5, half -0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_1.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 1.0, half 1.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]


; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -1.0, half -1.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_2.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 2.0, half 2.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -2.0, half -2.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_4.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 4.0, half 4.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -4.0, half -4.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; -----------------------------------------------------------------------------
; Same fadd, but the non-constant operand is loaded from memory (a VGPR value
; in the checks) instead of being a kernel argument.
; -----------------------------------------------------------------------------

; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_v2f16:
; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
; GFX9: buffer_store_dword [[REG]]

; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: buffer_load_dword
; VI-NOT: and
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}commute_add_literal_v2f16:
; GFX10: v_pk_add_f16 v0, 0x6400, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x00,0x0f,0xcc,0xff,0x00,0x02,0x10,0x00,0x64,0x00,0x00]

; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0] ; encoding: [0x00,0x00,0x8f,0xd3,0x00,0x09,0x00,0x08]
; GFX9: buffer_store_dword [[REG]]

; VI-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding
; VI-DAG: buffer_load_dword
; VI-NOT: and
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
; gfx8 does not support an sreg or imm operand in SDWA, so the constant is
; expected to be copied into a VGPR first.
; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: buffer_store_dword
define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %y = fadd <2 x half> %x, <half 1024.0, half 1024.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; -----------------------------------------------------------------------------
; fadd with half constants whose bit patterns are small integers (0xH0001,
; 0xH0002, 0xH0010, ...): the checks expect them to be printed as integer
; operands.
; -----------------------------------------------------------------------------

; GCN-LABEL: {{^}}add_inline_imm_1_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1 ; encoding
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1 ; encoding
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_2_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]


; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2 ; encoding
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2 ; encoding
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_16_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0] ; encoding
; GFX9: buffer_store_dword [[REG]]


; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16 ; encoding
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16 ; encoding
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; -----------------------------------------------------------------------------
; Despite the v2f16 names, the next three tests bitcast the argument to i32
; and perform an integer add, so the checks expect a scalar s_add_i32.
; The gfx900 checks accept any source SGPR rather than pinning one register
; number, so they do not depend on the exact kernel-argument register layout.
; -----------------------------------------------------------------------------

; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16:
; GFX9: s_add_i32 [[VAL:s[0-9]+]], s{{[0-9]+}}, -1
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1 ; encoding
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, -1
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16:
; GFX9: s_add_i32 [[VAL:s[0-9]+]], s{{[0-9]+}}, 0xfffefffe
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe ; encoding
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, 4294901758 ; 0xfffefffe
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16:
; GFX9: s_add_i32 [[VAL:s[0-9]+]], s{{[0-9]+}}, 0xfff0fff0
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
; GFX9: buffer_store_dword [[REG]]


; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0 ; encoding
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, 4293984240 ; 0xfff0fff0
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_63_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_64_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; -----------------------------------------------------------------------------
; Integer <2 x i16> mul/shl whose constant operand is the bit pattern of a
; half floating-point value. These are ordinary (non-kernel) functions; the
; checks expect the argument and result in v0. On gfx900 the constant is
; expected in an SGPR set up by s_mov_b32; on gfx1010 it is expected as a
; 16-bit literal operand (0xff source byte plus trailing literal bytes in the
; checked encoding).
; -----------------------------------------------------------------------------

; GCN-LABEL: {{^}}mul_inline_imm_0.5_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x38003800
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0x3800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x00]
define <2 x i16> @mul_inline_imm_0.5_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0.5, half 0.5> to <2 x i16>)
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}mul_inline_imm_neg_0.5_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xb800b800
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0xb800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0x00,0x00]
define <2 x i16> @mul_inline_imm_neg_0.5_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half -0.5, half -0.5> to <2 x i16>)
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}mul_inline_imm_1.0_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0x3c00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x00]
define <2 x i16> @mul_inline_imm_1.0_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half 1.0, half 1.0> to <2 x i16>)
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}mul_inline_imm_neg_1.0_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00bc00
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0xbc00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0x00,0x00]
define <2 x i16> @mul_inline_imm_neg_1.0_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half -1.0, half -1.0> to <2 x i16>)
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}shl_inline_imm_2.0_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40004000
; GFX9: v_pk_lshlrev_b16 v0, v0, [[K]]

; GFX10: v_pk_lshlrev_b16 v0, v0, 0x4000 op_sel_hi:[1,0] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x40,0x00,0x00]
define <2 x i16> @shl_inline_imm_2.0_v2i16(<2 x i16> %x) {
  %y = shl <2 x i16> bitcast (<2 x half> <half 2.0, half 2.0> to <2 x i16>), %x
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}shl_inline_imm_neg_2.0_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc000c000
; GFX9: v_pk_lshlrev_b16 v0, v0, [[K]]

; GFX10: v_pk_lshlrev_b16 v0, v0, 0xc000 op_sel_hi:[1,0] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc0,0x00,0x00]
define <2 x i16> @shl_inline_imm_neg_2.0_v2i16(<2 x i16> %x) {
  %y = shl <2 x i16> bitcast (<2 x half> <half -2.0, half -2.0> to <2 x i16>), %x
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}mul_inline_imm_4.0_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004400
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0x4400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x00]
define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half 4.0, half 4.0> to <2 x i16>)
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}mul_inline_imm_neg_4.0_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc400c400
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0xc400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0x00,0x00]
define <2 x i16> @mul_inline_imm_neg_4.0_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half -4.0, half -4.0> to <2 x i16>)
  ret <2 x i16> %y
}

; GCN-LABEL: {{^}}mul_inline_imm_inv2pi_v2i16:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x31183118
; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]

; GFX10: v_pk_mul_lo_u16 v0, 0x3118, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x00,0x00]
define <2 x i16> @mul_inline_imm_inv2pi_v2i16(<2 x i16> %x) {
  %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xH3118, half 0xH3118> to <2 x i16>)
  ret <2 x i16> %y
}

attributes #0 = { nounwind }