; RUN: opt < %s -instcombine -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define i16 @test1(float %f) {
entry:
; CHECK-LABEL: @test1(
; CHECK: fmul float
; CHECK-NOT: insertelement {{.*}} 0.00
; CHECK-NOT: call {{.*}} @llvm.x86.sse.mul
; CHECK-NOT: call {{.*}} @llvm.x86.sse.sub
; CHECK: ret
  %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
  %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
  %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
  %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
  %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1]
  %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
  %tmp69 = trunc i32 %tmp.upgrd.1 to i16 ; <i16> [#uses=1]
  ret i16 %tmp69
}

define i32 @test2(float %f) {
; CHECK-LABEL: @test2(
; CHECK-NOT: insertelement
; CHECK-NOT: extractelement
; CHECK: ret
  %tmp5 = fmul float %f, %f
  %tmp9 = insertelement <4 x float> undef, float %tmp5, i32 0
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
  %tmp19 = bitcast <4 x float> %tmp12 to <4 x i32>
  %tmp21 = extractelement <4 x i32> %tmp19, i32 0
  ret i32 %tmp21
}

define i64 @test3(float %f, double %d) {
; CHECK-LABEL: @test3(
; CHECK-NOT: insertelement {{.*}} 0.00
; CHECK: ret
entry:
  %v00 = insertelement <4 x float> undef, float %f, i32 0
  %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
  %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
  %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
  %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03)
  %v10 = insertelement <4 x float> undef, float %f, i32 0
  %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
  %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
  %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
  %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13)
  %v20 = insertelement <4 x float> undef, float %f, i32 0
  %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
  %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
  %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
  %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23)
  %v30 = insertelement <4 x float> undef, float %f, i32 0
  %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
  %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
  %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
  %tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33)
  %v40 = insertelement <2 x double> undef, double %d, i32 0
  %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
  %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41)
  %v50 = insertelement <2 x double> undef, double %d, i32 0
  %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
  %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51)
  %v60 = insertelement <2 x double> undef, double %d, i32 0
  %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
  %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61)
  %v70 = insertelement <2 x double> undef, double %d, i32 0
  %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
  %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71)
  %tmp8 = add i32 %tmp0, %tmp2
  %tmp9 = add i32 %tmp4, %tmp6
  %tmp10 = add i32 %tmp8, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = add i64 %tmp1, %tmp3
  %tmp13 = add i64 %tmp5, %tmp7
  %tmp14 = add i64 %tmp12, %tmp13
  %tmp15 = add i64 %tmp11, %tmp14
  ret i64 %tmp15
}

define void @get_image() nounwind {
; CHECK-LABEL: @get_image(
; CHECK-NOT: extractelement
; CHECK: unreachable
entry:
  %0 = call i32 @fgetc(i8* null) nounwind ; <i32> [#uses=1]
  %1 = trunc i32 %0 to i8 ; <i8> [#uses=1]
  %tmp2 = insertelement <100 x i8> zeroinitializer, i8 %1, i32 1 ; <<100 x i8>> [#uses=1]
  %tmp1 = extractelement <100 x i8> %tmp2, i32 0 ; <i8> [#uses=1]
  %2 = icmp eq i8 %tmp1, 80 ; <i1> [#uses=1]
  br i1 %2, label %bb2, label %bb3

bb2: ; preds = %entry
  br label %bb3

bb3: ; preds = %bb2, %entry
  unreachable
}

; PR4340
define void @vac(<4 x float>* nocapture %a) nounwind {
; CHECK-LABEL: @vac(
; CHECK-NOT: load
; CHECK: ret
entry:
  %tmp1 = load <4 x float>, <4 x float>* %a ; <<4 x float>> [#uses=1]
  %vecins = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 0 ; <<4 x float>> [#uses=1]
  %vecins4 = insertelement <4 x float> %vecins, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
  %vecins6 = insertelement <4 x float> %vecins4, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
  %vecins8 = insertelement <4 x float> %vecins6, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
  store <4 x float> %vecins8, <4 x float>* %a
  ret void
}

declare i32 @fgetc(i8*)

declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)

declare i32 @llvm.x86.sse.cvtss2si(<4 x float>)
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>)
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>)
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)

; <rdar://problem/6945110>
define <4 x i32> @kernel3_vertical(<4 x i16> * %src, <8 x i16> * %foo) nounwind {
entry:
  %tmp = load <4 x i16>, <4 x i16>* %src
  %tmp1 = load <8 x i16>, <8 x i16>* %foo
; CHECK: %tmp2 = shufflevector
  %tmp2 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; pmovzxwd ignores the upper 64-bits of its input; -instcombine should remove this shuffle:
; CHECK-NOT: shufflevector
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: pmovzxwd
  %0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3)
  ret <4 x i32> %0
}
declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone

define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
entry:
; CHECK-LABEL: define <4 x float> @dead_shuffle_elt(
; CHECK: shufflevector <2 x float> %y, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle.i = shufflevector <2 x float> %y, <2 x float> %y, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  %shuffle9.i = shufflevector <4 x float> %x, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle9.i
}

define <2 x float> @test_fptrunc(double %f) {
; CHECK-LABEL: @test_fptrunc(
; CHECK: insertelement
; CHECK: insertelement
; CHECK-NOT: insertelement
  %tmp9 = insertelement <4 x double> undef, double %f, i32 0
  %tmp10 = insertelement <4 x double> %tmp9, double 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x double> %tmp10, double 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x double> %tmp11, double 0.000000e+00, i32 3
  %tmp5 = fptrunc <4 x double> %tmp12 to <4 x float>
  %ret = shufflevector <4 x float> %tmp5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x float> %ret
}

define <2 x double> @test_fpext(float %f) {
; CHECK-LABEL: @test_fpext(
; CHECK: insertelement
; CHECK: insertelement
; CHECK-NOT: insertelement
  %tmp9 = insertelement <4 x float> undef, float %f, i32 0
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
  %tmp5 = fpext <4 x float> %tmp12 to <4 x double>
  %ret = shufflevector <4 x double> %tmp5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %ret
}

define <4 x float> @test_select(float %f, float %g) {
; CHECK-LABEL: @test_select(
; CHECK: %a0 = insertelement <4 x float> undef, float %f, i32 0
; CHECK-NOT: insertelement
; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3
; CHECK-NOT: insertelement
; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>
  %a0 = insertelement <4 x float> undef, float %f, i32 0
  %a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1
  %a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2
  %a3 = insertelement <4 x float> %a2, float 3.000000e+00, i32 3
  %b0 = insertelement <4 x float> undef, float %g, i32 0
  %b1 = insertelement <4 x float> %b0, float 4.000000e+00, i32 1
  %b2 = insertelement <4 x float> %b1, float 5.000000e+00, i32 2
  %b3 = insertelement <4 x float> %b2, float 6.000000e+00, i32 3
  %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> %b3
  ret <4 x float> %ret
}

; We should optimize these two redundant insertqi into one
; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
; CHECK-NOT: insertqi
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
  ret <2 x i64> %2
}

; The result of this insert is the second arg, since the top 64 bits of
; the result are undefined, and we copy the bottom 64 bits from the
; second arg
; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> %i
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
  ret <2 x i64> %1
}

; Test the several types of ranges and ordering that exist for two insertqi
; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> %i
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 0)
  ret <2 x i64> %1
}

; CHECK: define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> undef
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 16)
  ret <2 x i64> %1
}

; CHECK: define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> undef
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 32)
  ret <2 x i64> %1
}

; CHECK: define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> undef
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 16)
  ret <2 x i64> %1
}

; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind

declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps(
; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %a
}

declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_256(
; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %a
}

declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd(
; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 2, i64 0>)
  ret <2 x double> %a
}

declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_256(
; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 3, i64 1, i64 2, i64 0>)
  ret <4 x double> %a
}

define <4 x float> @test_vpermilvar_ps_zero(<4 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_zero(
; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
  ret <4 x float> %a
}

define <8 x float> @test_vpermilvar_ps_256_zero(<8 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_256_zero(
; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
  ret <8 x float> %a
}

define <2 x double> @test_vpermilvar_pd_zero(<2 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_zero(
; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> zeroinitializer)
  ret <2 x double> %a
}

define <4 x double> @test_vpermilvar_pd_256_zero(<4 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_256_zero(
; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> zeroinitializer)
  ret <4 x double> %a
}

define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_1
; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624>
}

define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_1
; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256>
}

define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_0
; CHECK: ret <2 x i64> zeroinitializer
}

define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_0
; CHECK: ret <4 x i64> zeroinitializer
}
define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_psrl_1
; CHECK: ret <2 x i64> <i64 562954248421376, i64 9007267974742020>
}

define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_psrl_1
; CHECK: ret <4 x i64> <i64 16, i64 32, i64 64, i64 128>
}

define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_psrl_0
; CHECK: ret <2 x i64> zeroinitializer
}

define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_psrl_0
; CHECK: ret <4 x i64> zeroinitializer
}

declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1
declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1
declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1

attributes #1 = { nounwind readnone }