; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s

; Inline-asm register-constraint tests for gfx908.
;
; Each kernel feeds values into an inline asm statement through the "a" and
; "v" constraint letters and stores the asm result back to memory. The
; patterns below expect "a"-constrained operands to be printed as a-registers
; and "v"-constrained operands as v-registers, and expect the surrounding
; v_accvgpr_write_b32 / v_accvgpr_read_b32 copies that move values into and
; out of the a-registers.
;
; The function-name suffix (avva, aaaa, avaa) mirrors the constraint letters
; of the four asm operands, in order: result, src0, src1, src2.

; Scalar round trip: one asm statement produces a float in an a-register,
; a second one reads that same register back into a v-register, and the
; result is stored to %arg.
; GCN-LABEL: {{^}}accvgpr_write_read:
; GFX908: v_accvgpr_write [[AREG:a[0-9]+]], 1
; GFX908: v_accvgpr_read [[VREG:v[0-9]+]], [[AREG]]
; GFX908: global_store_dword v{{[0-9]+}}, [[VREG]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @accvgpr_write_read(float addrspace(1)* %arg) {
bb:
  %in.1 = load float, float addrspace(1)* %arg
  %init = tail call float asm "v_accvgpr_write $0, 1", "=a"()
  %read = tail call float asm "v_accvgpr_read $0, $1", "=v,a"(float %init)
  store float %read, float addrspace(1)* %arg
  ret void
}

; <4 x float> accumulator in and out through "a"; both scalar sources use "v".
; Expects at least four writes before the asm and four reads after it.
; GCN-LABEL: {{^}}v_mfma_f32_4x4x1f32_avva:
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_mfma_f32_4x4x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_4x4x1f32_avva(<4 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x1f32 $0, $1, $2, $3", "=a,v,v,a"(float 1.0, float 2.0, <4 x float> %in.1)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}

; Same instruction as above, but every operand (including the two scalar
; float sources) is constrained to "a".
; GCN-LABEL: {{^}}v_mfma_f32_4x4x1f32_aaaa:
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_mfma_f32_4x4x1f32 a[{{[0-9:]+}}], a{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_4x4x1f32_aaaa(<4 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x1f32 $0, $1, $2, $3", "=a,a,a,a"(float 1.0, float 2.0, <4 x float> %in.1)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}

; All-"a" operands where the two sources are constant <4 x half> vectors; the
; pattern expects each source to be printed as an a-register range.
; Expects eight writes before the asm and four reads after it.
; GCN-LABEL: {{^}}v_mfma_f32_4x4x4f16_aaaa:
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_mfma_f32_4x4x4f16 a[{{[0-9:]+}}], a[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}], a[{{[0-9:]+}}]
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_4x4x4f16_aaaa(<4 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x4f16 $0, $1, $2, $3", "=a,a,a,a"(<4 x half> <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>, <4 x half> <half 0xH03FF, half 0xH03FF, half 0xH03FF, half 0xH03FF>, <4 x float> %in.1)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}

; <16 x float> accumulator; src0 uses "v", src1 and the accumulator use "a".
; Expects seventeen writes before the asm and sixteen reads after it.
; GCN-LABEL: {{^}}v_mfma_f32_16x16x1f32_avaa:
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_mfma_f32_16x16x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_16x16x1f32_avaa(<16 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  %mai.1 = tail call <16 x float> asm "v_mfma_f32_16x16x1f32 $0, $1, $2, $3", "=a,v,a,a"(float 1.0, float 2.0, <16 x float> %in.1)
  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
  ret void
}

; 32-element accumulator, passed as <32 x i32> rather than a float vector;
; src0 uses "v", src1 and the accumulator use "a".
; Expects thirty-three writes before the asm and thirty-two reads after it.
; GCN-LABEL: {{^}}v_mfma_f32_32x32x1f32_avaa:
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_accvgpr_write_b32
; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
; GFX908: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_32x32x1f32_avaa(<32 x i32> addrspace(1)* %arg) {
bb:
  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
  %mai.1 = tail call <32 x i32> asm "v_mfma_f32_32x32x1f32 $0, $1, $2, $3", "=a,v,a,a"(float 1.0, float 2.0, <32 x i32> %in.1)
  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
  ret void
}