;RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
;RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
;RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s

; Codegen test for the llvm.exp intrinsic on the AMDGPU target.
;
; Each function calls llvm.exp on a float or half value (scalar or vector) and
; the checks pin the expected instruction sequence for three llc invocations:
; the default subtarget, fiji, and gfx900. In every checked sequence the input
; is first multiplied by a constant and then fed to a v_exp instruction:
;   * 0x3fb8aa3b is the f32 bit pattern of ~1.442695, i.e. log2(e)
;   * 0x3dc5 is the f16 bit pattern of ~1.4424, i.e. log2(e) rounded to half
; This matches exp(x) = exp2(x * log2(e)), assuming v_exp_* evaluates a base-2
; exponential (see the AMD ISA documentation).

; Scalar f32: all three invocations expect a multiply by the literal constant
; followed by v_exp_f32.
define float @v_exp_f32(float %arg0) {
; SI-LABEL: v_exp_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; SI-NEXT:    v_exp_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_exp_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; VI-NEXT:    v_exp_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_exp_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GFX9-NEXT:    v_exp_f32_e32 v0, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = call float @llvm.exp.f32(float %arg0)
  ret float %result
}

; <2 x float>: the constant is materialized once into a scalar register and
; reused by both per-element multiplies. Checked with the shared prefix.
define <2 x float> @v_exp_v2f32(<2 x float> %arg0) {
; GCN-LABEL: v_exp_v2f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b
; GCN-NEXT:    v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
; GCN-NEXT:    v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
; GCN-NEXT:    v_exp_f32_e32 v0, v0
; GCN-NEXT:    v_exp_f32_e32 v1, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <2 x float> @llvm.exp.v2f32(<2 x float> %arg0)
  ret <2 x float> %result
}

; <3 x float>: same shape as the two-element case with a third multiply/exp
; pair. Checked with the shared prefix.
define <3 x float> @v_exp_v3f32(<3 x float> %arg0) {
; GCN-LABEL: v_exp_v3f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b
; GCN-NEXT:    v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
; GCN-NEXT:    v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
; GCN-NEXT:    v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
; GCN-NEXT:    v_exp_f32_e32 v0, v0
; GCN-NEXT:    v_exp_f32_e32 v1, v1
; GCN-NEXT:    v_exp_f32_e32 v2, v2
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
  %result = call <3 x float> @llvm.exp.v3f32(<3 x float> %arg0)
  ret <3 x float> %result
}

; <4 x float>: four multiply/exp pairs sharing one scalar constant.
; NOTE(review): only the default-subtarget prefix has checks here; the fiji and
; gfx900 invocations compile this function but verify nothing for it.
define <4 x float> @v_exp_v4f32(<4 x float> %arg0) {
; SI-LABEL: v_exp_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b
; SI-NEXT:    v_mul_f32_e32 v0, [[SREG]], v0
; SI-NEXT:    v_mul_f32_e32 v1, [[SREG]], v1
; SI-NEXT:    v_mul_f32_e32 v2, [[SREG]], v2
; SI-NEXT:    v_mul_f32_e32 v3, [[SREG]], v3
; SI-NEXT:    v_exp_f32_e32 v0, v0
; SI-NEXT:    v_exp_f32_e32 v1, v1
; SI-NEXT:    v_exp_f32_e32 v2, v2
; SI-NEXT:    v_exp_f32_e32 v3, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.exp.v4f32(<4 x float> %arg0)
  ret <4 x float> %result
}

; Scalar f16: the default subtarget is expected to round-trip through f32
; conversions and use the f32 multiply/exp, while fiji and gfx900 are expected
; to use the f16 multiply (constant 0x3dc5) and v_exp_f16 directly.
define half @v_exp_f16(half %arg0) {
; SI-LABEL: v_exp_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; SI-NEXT:    v_exp_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_exp_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_f16_e32 v0, 0x3dc5, v0
; VI-NEXT:    v_exp_f16_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_exp_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_f16_e32 v0, 0x3dc5, v0
; GFX9-NEXT:    v_exp_f16_e32 v0, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = call half @llvm.exp.f16(half %arg0)
  ret half %result
}

; <2 x half>: the default subtarget works per element in f32; fiji uses SDWA
; forms to reach the high half and ORs the halves back together; gfx900 uses a
; packed multiply and then reassembles the result with and/lshl_or.
define <2 x half> @v_exp_v2f16(<2 x half> %arg0) {
; SI-LABEL: v_exp_v2f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
; SI-NEXT:    v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
; SI-NEXT:    v_exp_f32_e32 v0, v0
; SI-NEXT:    v_exp_f32_e32 v1, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_exp_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5
; VI-NEXT:    v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
; VI-NEXT:    v_mul_f16_sdwa [[MUL1:v[0-9]+]], v{{[0-9]+}}, [[VREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_mul_f16_e32 [[MUL2:v[0-9]+]], [[SREG]], v{{[0-9]+}}
; VI-NEXT:    v_exp_f16_sdwa [[MUL1]], [[MUL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_exp_f16_e32 [[MUL2]], [[MUL2]]
; VI-NEXT:    v_or_b32_e32 v{{[0-9]+}}, [[MUL2]], [[MUL1]]
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_exp_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5
; GFX9-NEXT:    v_pk_mul_f16 v0, v0, [[SREG]] op_sel_hi:[1,0]
; GFX9-NEXT:    v_exp_f16_e32 v1, v0
; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = call <2 x half> @llvm.exp.v2f16(<2 x half> %arg0)
  ret <2 x half> %result
}

; NOTE(review): the <3 x half> case below is disabled; the reason is not
; visible in this file. Its intrinsic declaration is still present further
; down.
; define <3 x half> @v_exp_v3f16(<3 x half> %arg0) {
;   %result = call <3 x half> @llvm.exp.v3f16(<3 x half> %arg0)
;   ret <3 x half> %result
; }

; <4 x half>: the default subtarget works per element in f32; fiji and gfx900
; multiply the low and high halves of each input register separately (the high
; halves via SDWA) and then recombine the exp results into two registers.
define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
; SI-LABEL: v_exp_v4f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, [[SREG]], v0
; SI-NEXT:    v_mul_f32_e32 v1, [[SREG]], v1
; SI-NEXT:    v_mul_f32_e32 v2, [[SREG]], v2
; SI-NEXT:    v_mul_f32_e32 v3, [[SREG]], v3
; SI-NEXT:    v_exp_f32_e32 v0, v0
; SI-NEXT:    v_exp_f32_e32 v1, v1
; SI-NEXT:    v_exp_f32_e32 v2, v2
; SI-NEXT:    v_exp_f32_e32 v3, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_exp_v4f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5
; VI-NEXT:    v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
; VI-NEXT:    v_mul_f16_e32 [[MUL1:v[0-9]+]], [[SREG]], v1
; VI-NEXT:    v_mul_f16_e32 [[MUL2:v[0-9]+]], [[SREG]], v0
; VI-NEXT:    v_mul_f16_sdwa [[MUL3:v[0-9]+]], v1, [[VREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_mul_f16_sdwa [[MUL4:v[0-9]+]], v0, [[VREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_exp_f16_e32 [[EXP1:v[0-9]+]], [[MUL1]]
; VI-NEXT:    v_exp_f16_sdwa [[EXP2:v[0-9]+]], v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_exp_f16_e32 [[EXP3:v[0-9]+]], [[MUL2]]
; VI-NEXT:    v_exp_f16_sdwa [[EXP4:v[0-9]+]], v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v1, [[EXP1]], [[EXP2]]
; VI-NEXT:    v_or_b32_e32 v0, [[EXP3]], [[EXP4]]
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_exp_v4f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5
; GFX9-NEXT:    v_mul_f16_e32 [[MUL1:v[0-9]+]], [[SREG]], v1
; GFX9-NEXT:    v_mul_f16_e32 [[MUL2:v[0-9]+]], [[SREG]], v0
; GFX9-NEXT:    v_mul_f16_sdwa [[MUL3:v[0-9]+]], v1, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_mul_f16_sdwa [[MUL4:v[0-9]+]], v0, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_exp_f16_e32 [[EXP1:v[0-9]+]], [[MUL1]]
; GFX9-NEXT:    v_exp_f16_e32 [[EXP2:v[0-9]+]], [[MUL2]]
; GFX9-NEXT:    v_exp_f16_e32 [[EXP3:v[0-9]+]], [[MUL4]]
; GFX9-NEXT:    v_exp_f16_e32 [[EXP4:v[0-9]+]], [[MUL3]]
; GFX9-NEXT:    v_mov_b32_e32 [[VCONST:v[0-9]+]], 0xffff
; GFX9-NEXT:    v_and_b32_e32 [[AND1:v[0-9]+]], [[VCONST]], [[EXP2]]
; GFX9-NEXT:    v_and_b32_e32 [[AND2:v[0-9]+]], [[VCONST]], [[EXP1]]
; GFX9-NEXT:    v_lshl_or_b32 v0, [[EXP3]], 16, [[AND1]]
; GFX9-NEXT:    v_lshl_or_b32 v1, [[EXP4]], 16, [[AND2]]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x half> @llvm.exp.v4f16(<4 x half> %arg0)
  ret <4 x half> %result
}

; Intrinsic declarations for every element type / vector width used above.
declare float @llvm.exp.f32(float)
declare <2 x float> @llvm.exp.v2f32(<2 x float>)
declare <3 x float> @llvm.exp.v3f32(<3 x float>)
declare <4 x float> @llvm.exp.v4f32(<4 x float>)

declare half @llvm.exp.f16(half)
declare <2 x half> @llvm.exp.v2f16(<2 x half>)
declare <3 x half> @llvm.exp.v3f16(<3 x half>)
declare <4 x half> @llvm.exp.v4f16(<4 x half>)