; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Instruction-selection checks for 16-bit integer multiplies, run for three
; subtargets (tahiti, fiji, gfx900).
; NOTE(review): the SIVI prefix is passed to FileCheck for tahiti and fiji,
; but no check in the visible part of this file uses it -- confirm it is
; still wanted.

; Non-kernel function that returns the product of its two i16 arguments.
; The tahiti checks expect two v_and_b32 with the 0xffff constant followed by
; v_mul_u32_u24; the fiji/gfx900 check expects v_mul_lo_u16_e32 v0, v0, v1.
; GCN-LABEL: {{^}}v_mul_i16:
; SI: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; SI: v_and_b32_e32 v{{[0-9]+}}, [[K]]
; SI: v_and_b32_e32 v{{[0-9]+}}, [[K]]
; SI: v_mul_u32_u24

; GFX89: v_mul_lo_u16_e32 v0, v0, v1
define i16 @v_mul_i16(i16 %a, i16 %b) {
  %prod = mul i16 %a, %b
  ret i16 %prod
}

; Kernel that multiplies its two i16 arguments and volatile-stores the
; product through the null addrspace(1) pointer.
; NOTE(review): the fiji-only check pattern below is textually identical to
; this function's symbol name, so it may be satisfied by something other than
; an instruction -- confirm the intended mnemonic. No check in this function
; applies to gfx900.
; FIXME: Should emit scalar mul or maybe i16 v_mul here
; GCN-LABEL: {{^}}s_mul_i16:
; SI: v_mul_u32_u24
; VI: s_mul_i16
define amdgpu_kernel void @s_mul_i16(i16 %a, i16 %b) {
  %prod = mul i16 %a, %b
  store volatile i16 %prod, i16 addrspace(1)* null
  ret void
}

; FIXME: Should emit u16 mul here.
; Instead it's worse than SI
; Kernel that loads one i16 through each of %a and %b, multiplies the two
; values, and stores the product through %r.
; GCN-LABEL: {{^}}v_mul_i16_uniform_load:
; SI: v_mul_u32_u24
; GFX89: v_mul_lo_i32
define amdgpu_kernel void @v_mul_i16_uniform_load(i16 addrspace(1)* %r, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
entry:
  %lhs = load i16, i16 addrspace(1)* %a
  %rhs = load i16, i16 addrspace(1)* %b
  %prod = mul i16 %lhs, %rhs
  store i16 %prod, i16 addrspace(1)* %r
  ret void
}

; <2 x i16> multiply in a non-kernel function. The gfx900 checks require one
; v_pk_mul_lo_u16 v0, v0, v1 directly between s_waitcnt and s_setpc_b64.
; GCN-LABEL: {{^}}v_mul_v2i16:
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32

; VI: v_mul_lo_u16_sdwa
; VI: v_mul_lo_u16_e32
; VI: v_or_b32_e32


; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64
define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) {
  %prod = mul <2 x i16> %a, %b
  ret <2 x i16> %prod
}

; <3 x i16> multiply in a non-kernel function. The gfx900 checks require two
; consecutive v_pk_mul_lo_u16 directly between s_waitcnt and s_setpc_b64;
; tahiti and fiji each check for three multiplies.
; FIXME: Unpack garbage on gfx9
; GCN-LABEL: {{^}}v_mul_v3i16:
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32

; VI: v_mul_lo_u16
; VI: v_mul_lo_u16
; VI: v_mul_lo_u16

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16
; GFX9-NEXT: s_setpc_b64
define <3 x i16> @v_mul_v3i16(<3 x i16> %a, <3 x i16> %b) {
  %prod = mul <3 x i16> %a, %b
  ret <3 x i16> %prod
}

; <4 x i16> multiply in a non-kernel function. The gfx900 checks require
; v_pk_mul_lo_u16 v0, v0, v2 then v_pk_mul_lo_u16 v1, v1, v3 directly between
; s_waitcnt and s_setpc_b64.
; GCN-LABEL: {{^}}v_mul_v4i16:
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32

; VI: v_mul_lo_u16_sdwa
; VI: v_mul_lo_u16_e32
; VI: v_mul_lo_u16_sdwa
; VI: v_mul_lo_u16_e32
; VI: v_or_b32_e32
; VI: v_or_b32_e32

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX9-NEXT: s_setpc_b64
define <4 x i16> @v_mul_v4i16(<4 x i16> %a, <4 x i16> %b) {
  %prod = mul <4 x i16> %a, %b
  ret <4 x i16> %prod
}