; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI

; Codegen test for half-precision "fmul followed by fadd of the constant 10.0"
; on two amdgcn subtargets (tahiti and fiji, as selected by -mcpu above).
; The assertion lines inside each function are machine-generated; regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.

; Single use of the constant: r = a * b + 10.0.
; The tahiti assertions expect both loaded halves to be converted to f32 and
; combined by v_madak_f32 with the literal 0x41200000 (10.0 encoded as f32),
; then converted back to f16 before the store. The fiji assertions expect a
; single v_madak_f16 with the literal 0x4900 (10.0 encoded as f16).
define amdgpu_kernel void @madak_f16(
; SI-LABEL: madak_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s6
; SI-NEXT:    s_mov_b32 s13, s7
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_madak_f32 v0, v1, v0, 0x41200000
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: madak_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_madak_f16 v0, v0, v1, 0x4900
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b

  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, 10.0

  store half %r.val, half addrspace(1)* %r
  ret void
}

; Two uses of the constant: r0 = a * b + 10.0 and r1 = a * c + 10.0.
; For both subtargets the assertions expect one result to come from v_madak
; (constant as an inline literal) and the other from v_mac accumulating into a
; register that v_mov_b32 preloaded with the same constant.
; All three input loads are volatile (presumably to keep them as three
; separate, ordered loads -- TODO confirm the original intent).
define amdgpu_kernel void @madak_f16_use_2(
; SI-LABEL: madak_f16_use_2:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s8
; SI-NEXT:    s_mov_b32 s17, s9
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0
; SI-NEXT:    v_mov_b32_e32 v3, 0x41200000
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_madak_f32 v1, v0, v1, 0x41200000
; SI-NEXT:    v_mac_f32_e32 v3, v0, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_short v1, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: madak_f16_use_2:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s8
; VI-NEXT:    s_mov_b32 s17, s9
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x4900
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_madak_f16 v1, v0, v1, 0x4900
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mac_f16_e32 v3, v0, v2
; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0
; VI-NEXT:    buffer_store_short v3, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r0,
    half addrspace(1)* %r1,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c

  %t0.val = fmul half %a.val, %b.val
  %t1.val = fmul half %a.val, %c.val
  %r0.val = fadd half %t0.val, 10.0
  %r1.val = fadd half %t1.val, 10.0

  store half %r0.val, half addrspace(1)* %r0
  store half %r1.val, half addrspace(1)* %r1
  ret void
}

; Attribute group shared by both kernels: sets the "denormal-fp-math" function
; attribute to preserve-sign for both of its comma-separated fields.
attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }