; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc --mtriple=aarch64-eabi -aarch64-neon-syntax=generic -mattr=+fullfp16 < %s | FileCheck %s
; RUN: llc --mtriple=aarch64-eabi -aarch64-neon-syntax=generic < %s | FileCheck %s --check-prefix=CHECKNOFP16

; Code generation tests for the llvm.vector.reduce.fadd intrinsic on AArch64
; vector types with half, float and double elements. Every call below carries
; the `fast` flag and passes -0.0 as the start value; none of the expected
; sequences contains an instruction that adds the start value.
;
; The first run line enables +fullfp16 and is matched with the default prefix.
; The second run line uses the default feature set and is matched with the
; CHECKNOFP16 prefix. The float and double cases expect the same code under
; both configurations; only the half cases differ.

; <2 x float>, expected to become a single pairwise faddp.
define float @add_HalfS(<2 x float> %bin.rdx)  {
; CHECK-LABEL: add_HalfS:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_HalfS:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECKNOFP16-NEXT:    faddp s0, v0.2s
; CHECKNOFP16-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx)
  ret float %r
}

; <4 x half>. With +fullfp16 the expected code is one faddp on the low pair
; followed by two scalar half adds. Without it, each of the three adds is done
; in single precision: operands are widened with fcvt and the sum is narrowed
; back to half after every add.
define half @add_HalfH(<4 x half> %bin.rdx)  {
; CHECK-LABEL: add_HalfH:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov h1, v0.h[3]
; CHECK-NEXT:    mov h2, v0.h[2]
; CHECK-NEXT:    faddp h0, v0.2h
; CHECK-NEXT:    fadd h0, h0, h2
; CHECK-NEXT:    fadd h0, h0, h1
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_HalfH:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECKNOFP16-NEXT:    mov h3, v0.h[1]
; CHECKNOFP16-NEXT:    mov h1, v0.h[3]
; CHECKNOFP16-NEXT:    mov h2, v0.h[2]
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fcvt s3, h3
; CHECKNOFP16-NEXT:    fadd s0, s0, s3
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s2
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s0, s0, s1
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
  ret half %r
}


; <8 x half>. With +fullfp16 the upper half of the register is folded onto the
; lower half (ext + vector fadd) before the 4-element sequence. Without it, the
; expected code is a chain of seven single-precision adds with an fcvt round
; trip through half after each one.
define half @add_H(<8 x half> %bin.rdx)  {
; CHECK-LABEL: add_H:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    fadd v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    mov h1, v0.h[2]
; CHECK-NEXT:    faddp h2, v0.2h
; CHECK-NEXT:    fadd h1, h2, h1
; CHECK-NEXT:    mov h0, v0.h[3]
; CHECK-NEXT:    fadd h0, h1, h0
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_H:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    mov h7, v0.h[1]
; CHECKNOFP16-NEXT:    mov h1, v0.h[7]
; CHECKNOFP16-NEXT:    mov h2, v0.h[6]
; CHECKNOFP16-NEXT:    mov h3, v0.h[5]
; CHECKNOFP16-NEXT:    mov h4, v0.h[4]
; CHECKNOFP16-NEXT:    mov h5, v0.h[3]
; CHECKNOFP16-NEXT:    mov h6, v0.h[2]
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fcvt s7, h7
; CHECKNOFP16-NEXT:    fadd s0, s0, s7
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s6, h6
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s6
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s5, h5
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s5
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s4, h4
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s4
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s3, h3
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s3
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s2
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s0, s0, s1
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx)
  ret half %r
}

; <4 x float>, expected to fold the upper half onto the lower half (ext +
; vector fadd) and finish with a pairwise faddp.
define float @add_S(<4 x float> %bin.rdx)  {
; CHECK-LABEL: add_S:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    fadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_S:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECKNOFP16-NEXT:    fadd v0.2s, v0.2s, v1.2s
; CHECKNOFP16-NEXT:    faddp s0, v0.2s
; CHECKNOFP16-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx)
  ret float %r
}

; <2 x double>, expected to become a single pairwise faddp.
define double @add_D(<2 x double> %bin.rdx)  {
; CHECK-LABEL: add_D:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp d0, v0.2d
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_D:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    faddp d0, v0.2d
; CHECKNOFP16-NEXT:    ret
  %r = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx)
  ret double %r
}

; <16 x half>, passed in two vector registers. With +fullfp16 the two registers
; are added with one vector fadd and the result is reduced like the 8-element
; case. Without it, corresponding lanes of v0 and v1 are added in single
; precision and the eight partial sums are then accumulated in a scalar chain,
; narrowing to half and widening again around each add.
define half @add_2H(<16 x half> %bin.rdx)  {
; CHECK-LABEL: add_2H:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    fadd v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    mov h1, v0.h[2]
; CHECK-NEXT:    faddp h2, v0.2h
; CHECK-NEXT:    fadd h1, h2, h1
; CHECK-NEXT:    mov h0, v0.h[3]
; CHECK-NEXT:    fadd h0, h1, h0
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_2H:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    mov h2, v1.h[1]
; CHECKNOFP16-NEXT:    mov h3, v0.h[1]
; CHECKNOFP16-NEXT:    mov h6, v1.h[2]
; CHECKNOFP16-NEXT:    mov h7, v0.h[2]
; CHECKNOFP16-NEXT:    mov h16, v1.h[3]
; CHECKNOFP16-NEXT:    mov h17, v0.h[3]
; CHECKNOFP16-NEXT:    fcvt s4, h1
; CHECKNOFP16-NEXT:    fcvt s5, h0
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fcvt s3, h3
; CHECKNOFP16-NEXT:    fcvt s6, h6
; CHECKNOFP16-NEXT:    fcvt s7, h7
; CHECKNOFP16-NEXT:    fcvt s16, h16
; CHECKNOFP16-NEXT:    fcvt s17, h17
; CHECKNOFP16-NEXT:    fadd s4, s5, s4
; CHECKNOFP16-NEXT:    mov h5, v1.h[4]
; CHECKNOFP16-NEXT:    fadd s2, s3, s2
; CHECKNOFP16-NEXT:    mov h3, v0.h[4]
; CHECKNOFP16-NEXT:    fadd s6, s7, s6
; CHECKNOFP16-NEXT:    mov h7, v1.h[5]
; CHECKNOFP16-NEXT:    fadd s16, s17, s16
; CHECKNOFP16-NEXT:    mov h17, v0.h[5]
; CHECKNOFP16-NEXT:    fcvt s5, h5
; CHECKNOFP16-NEXT:    fcvt s3, h3
; CHECKNOFP16-NEXT:    fcvt s7, h7
; CHECKNOFP16-NEXT:    fcvt s17, h17
; CHECKNOFP16-NEXT:    fadd s3, s3, s5
; CHECKNOFP16-NEXT:    mov h5, v1.h[6]
; CHECKNOFP16-NEXT:    fadd s7, s17, s7
; CHECKNOFP16-NEXT:    mov h17, v0.h[6]
; CHECKNOFP16-NEXT:    mov h1, v1.h[7]
; CHECKNOFP16-NEXT:    mov h0, v0.h[7]
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s1
; CHECKNOFP16-NEXT:    fcvt h1, s4
; CHECKNOFP16-NEXT:    fcvt h2, s2
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fadd s1, s1, s2
; CHECKNOFP16-NEXT:    fcvt h2, s6
; CHECKNOFP16-NEXT:    fcvt h1, s1
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s1, s1, s2
; CHECKNOFP16-NEXT:    fcvt h2, s16
; CHECKNOFP16-NEXT:    fcvt h1, s1
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s1, s1, s2
; CHECKNOFP16-NEXT:    fcvt h2, s3
; CHECKNOFP16-NEXT:    fcvt h1, s1
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s1, s1, s2
; CHECKNOFP16-NEXT:    fcvt h3, s7
; CHECKNOFP16-NEXT:    fcvt h1, s1
; CHECKNOFP16-NEXT:    fcvt s5, h5
; CHECKNOFP16-NEXT:    fcvt s17, h17
; CHECKNOFP16-NEXT:    fcvt s3, h3
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s5, s17, s5
; CHECKNOFP16-NEXT:    fadd s1, s1, s3
; CHECKNOFP16-NEXT:    fcvt h4, s5
; CHECKNOFP16-NEXT:    fcvt h1, s1
; CHECKNOFP16-NEXT:    fcvt s4, h4
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s1, s1, s4
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt h1, s1
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s1, s0
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half -0.0, <16 x half> %bin.rdx)
  ret half %r
}

; <8 x float>, passed in two vector registers: one vector fadd combines them,
; then the expected code matches the 4-element float case.
define float @add_2S(<8 x float> %bin.rdx)  {
; CHECK-LABEL: add_2S:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    fadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_2S:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECKNOFP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECKNOFP16-NEXT:    fadd v0.2s, v0.2s, v1.2s
; CHECKNOFP16-NEXT:    faddp s0, v0.2s
; CHECKNOFP16-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %bin.rdx)
  ret float %r
}

; <4 x double>, passed in two vector registers: one vector fadd combines them,
; followed by a pairwise faddp.
define double @add_2D(<4 x double> %bin.rdx)  {
; CHECK-LABEL: add_2D:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    faddp d0, v0.2d
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_2D:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    fadd v0.2d, v0.2d, v1.2d
; CHECKNOFP16-NEXT:    faddp d0, v0.2d
; CHECKNOFP16-NEXT:    ret
  %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %bin.rdx)
  ret double %r
}

; Declarations of the reduction intrinsics called above, one per element type
; and vector width under test.
; Function Attrs: nounwind readnone
declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)