; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Tests for SSE2 and below, without SSE3+.
; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s

; test1: replace element 0 of the vector loaded from %A with the scalar %B
; (shuffle mask <2,1>) and store the result to %r. The expected code inserts
; the scalar straight from its stack slot with movlpd.
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; CHECK-LABEL: test1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    movapd (%ecx), %xmm0
; CHECK-NEXT:    movlpd {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT:    movapd %xmm0, (%eax)
; CHECK-NEXT:    retl
  %tmp3 = load <2 x double>, <2 x double>* %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
  store <2 x double> %tmp9, <2 x double>* %r, align 16
  ret void
}

; test2: counterpart of test1 for the high element. Mask <0,2> keeps element 0
; of the loaded vector and places %B in element 1; the expected insert is
; movhpd.
define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; CHECK-LABEL: test2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    movapd (%ecx), %xmm0
; CHECK-NEXT:    movhpd {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT:    movapd %xmm0, (%eax)
; CHECK-NEXT:    retl
  %tmp3 = load <2 x double>, <2 x double>* %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
  store <2 x double> %tmp9, <2 x double>* %r, align 16
  ret void
}


; test3: build <A[0], B[0], A[1], B[1]> one scalar at a time with
; extractelement/insertelement. The expected code recognises the interleave
; and emits a single unpcklps with a memory operand.
define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: test3:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT:    movaps (%edx), %xmm0
; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT:    movaps %xmm0, (%eax)
; CHECK-NEXT:    retl
  %tmp = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=2]
  %tmp3 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=2]
  %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; <float> [#uses=1]
  %tmp7 = extractelement <4 x float> %tmp, i32 0 ; <float> [#uses=1]
  %tmp8 = extractelement <4 x float> %tmp3, i32 1 ; <float> [#uses=1]
  %tmp9 = extractelement <4 x float> %tmp, i32 1 ; <float> [#uses=1]
  %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0 ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1 ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2 ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp13, <4 x float>* %res
  ret void
}

; test4: shuffle %X against undef with mask <2,6,3,7>. Result lanes 1 and 3
; select from the undef operand, so only X[2] (lane 0) and X[3] (lane 2) are
; defined. The expected code is one in-register shufps.
define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
; CHECK-LABEL: test4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; CHECK-NEXT:    movaps %xmm0, (%eax)
; CHECK-NEXT:    retl
  %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp5, <4 x float>* %res
  ret void
}

; test5: load a float through the pointer stored at %ptr, zero the other three
; lanes, then interleave its bytes with zero bytes and interleave zero words
; with the resulting words. The expected code is a scalar movss load followed
; by punpcklbw and punpcklwd against a zeroed register.
define <4 x i32> @test5(i8** %ptr) nounwind {
; CHECK-LABEL: test5:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl (%eax), %eax
; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    pxor %xmm0, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT:    retl
  %tmp = load i8*, i8** %ptr ; <i8*> [#uses=1]
  %tmp.upgrd.1 = bitcast i8* %tmp to float* ; <float*> [#uses=1]
  %tmp.upgrd.2 = load float, float* %tmp.upgrd.1 ; <float> [#uses=1]
  %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0 ; <<4 x float>> [#uses=1]
  %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
  %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8> ; <<16 x i8>> [#uses=1]
  %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 > ; <<16 x i8>> [#uses=1]
  %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16> ; <<8 x i16>> [#uses=1]
  %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x i16>> [#uses=1]
  %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32> ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %tmp36
}

; test6: mask <0,5,6,7> against an undef second operand leaves only lane 0
; defined, so the shuffle is expected to disappear: just a vector load and a
; vector store.
define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
; CHECK-LABEL: test6:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    movaps (%ecx), %xmm0
; CHECK-NEXT:    movaps %xmm0, (%eax)
; CHECK-NEXT:    retl
  %tmp1 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=1]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp2, <4 x float>* %res
  ret void
}

; test7: splat element 0 of an all-zero vector and store it to the null
; address. The two instructions are unnamed and are referenced by their
; implicit numbers %1 and %2. The expected code zeroes a register with xorps
; and stores it to absolute address 0.
define void @test7() nounwind {
; CHECK-LABEL: test7:
; CHECK:       ## BB#0:
; CHECK-NEXT:    xorps %xmm0, %xmm0
; CHECK-NEXT:    movaps %xmm0, 0
; CHECK-NEXT:    retl
  bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1]
  shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1]
  store <4 x float> %2, <4 x float>* null
  ret void
}

; External four-element i32 array read by test8; no explicit alignment is
; given.
@x = external global [4 x i32]

; test8: four scalar i32 loads of consecutive elements of @x are assembled
; into one vector. The expected code merges them into a single unaligned
; vector load (movups) through the non-lazy pointer for @x.
define <2 x i64> @test8() nounwind {
; CHECK-LABEL: test8:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl L_x$non_lazy_ptr, %eax
; CHECK-NEXT:    movups (%eax), %xmm0
; CHECK-NEXT:    retl
  %tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0) ; <i32> [#uses=1]
  %tmp3 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 1) ; <i32> [#uses=1]
  %tmp5 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 2) ; <i32> [#uses=1]
  %tmp7 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 3) ; <i32> [#uses=1]
  %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 ; <<4 x i32>> [#uses=1]
  %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1 ; <<4 x i32>> [#uses=1]
  %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2 ; <<4 x i32>> [#uses=1]
  %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3 ; <<4 x i32>> [#uses=1]
  %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %tmp16
}

; test9: four float arguments that follow an i32 argument are assembled into
; a vector. The expected code is one unaligned stack load (movups).
; NOTE(review): presumably %dummy is there to push the floats off 16-byte
; alignment, in contrast with test10 - confirm against the calling convention.
define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
; CHECK-LABEL: test9:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT:    retl
  %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp13
}

; test10: same vector build as test9 but without the leading i32 argument.
; Here the expected stack load is the aligned form (movaps).
define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
; CHECK-LABEL: test10:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT:    retl
  %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp13
}

; test11: two double arguments assembled into a <2 x double>. The expected
; code is a single aligned stack load (movaps).
define <2 x double> @test11(double %a, double %b) nounwind {
; CHECK-LABEL: test11:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT:    retl
  %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1]
  %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1]
  ret <2 x double> %tmp7
}

; test12: load a vector from the null address and form two shuffles of it:
;   %tmp2 = <v[0], v[1], 1.0, 1.0>   (mask <0,1,6,7> against a splat of 1.0)
;   %tmp3 = <v[2], v[3], 0.0, 0.0>   (mask <2,3,6,7> against zero)
; The two are added and the sum is stored back to null. The expected code uses
; movsd for the first shuffle and unpckhpd against a zeroed register for the
; second.
define void @test12() nounwind {
; CHECK-LABEL: test12:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movapd 0, %xmm0
; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; CHECK-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; CHECK-NEXT:    xorpd %xmm2, %xmm2
; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; CHECK-NEXT:    addps %xmm1, %xmm0
; CHECK-NEXT:    movaps %xmm0, 0
; CHECK-NEXT:    retl
  %tmp1 = load <4 x float>, <4 x float>* null ; <<4 x float>> [#uses=2]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
  %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp4, <4 x float>* null
  ret void
}

; test13: mask <1,4,1,5> over the vectors loaded from %B and %C yields
; <B[1], C[0], B[1], C[1]>. The expected code needs two shufps instructions.
; %A is not used by the body.
define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
; CHECK-LABEL: test13:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT:    movaps (%edx), %xmm0
; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; CHECK-NEXT:    movaps %xmm0, (%eax)
; CHECK-NEXT:    retl
  %tmp3 = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=1]
  %tmp5 = load <4 x float>, <4 x float>* %C ; <<4 x float>> [#uses=1]
  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp11, <4 x float>* %res
  ret void
}

; test14: compute x+y and x-y, then concatenate the low halves of the sum and
; the difference (mask <0,1,4,5>). The expected combine is one unpcklpd.
define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
; CHECK-LABEL: test14:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    movaps (%ecx), %xmm1
; CHECK-NEXT:    movaps (%eax), %xmm2
; CHECK-NEXT:    movaps %xmm2, %xmm0
; CHECK-NEXT:    addps %xmm1, %xmm0
; CHECK-NEXT:    subps %xmm1, %xmm2
; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; CHECK-NEXT:    retl
  %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=2]
  %tmp5 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=2]
  %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
  %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
  %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp27
}

; test15: mask <2,3,6,7> concatenates the high half of *%x with the high half
; of *%y. The expected code is a load plus unpckhpd with a memory operand.
define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
; CHECK-LABEL: test15:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    movapd (%ecx), %xmm0
; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; CHECK-NEXT:    retl
entry:
  %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=1]
  %tmp3 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=1]
  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp4
}

; PR8900
; test16: load the <4 x double> at index 3 of %srcA (byte offset 96 in the
; expected code) and return its elements 0 and 2. %dst is not used by the
; body.
define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
; CHECK-LABEL: test16:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movapd 96(%eax), %xmm0
; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; CHECK-NEXT:    retl
  %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3
  %i6 = load <4 x double>, <4 x double>* %i5, align 32
  %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %i7
}

; PR9009
; test17: shuffle a partly-undef constant with an all-undef vector; lanes 0
; and 1 stay undef and lanes 2 and 3 are 32768. The result is stored through
; an undef pointer. The expected code is a constant-pool load and a store.
define fastcc void @test17() nounwind {
; CHECK-LABEL: test17:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
; CHECK-NEXT:    movaps %xmm0, (%eax)
; CHECK-NEXT:    retl
entry:
  %0 = insertelement <4 x i32> undef, i32 undef, i32 1
  %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  %2 = bitcast <4 x i32> %1 to <4 x float>
  store <4 x float> %2, <4 x float> * undef
  ret void
}

; PR9210
; f: truncate <4 x double> to <4 x float>. The expected code converts each
; 128-bit half with cvtpd2ps and joins the two results with unpcklpd.
define <4 x float> @f(<4 x double>) nounwind {
; CHECK-LABEL: f:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    cvtpd2ps %xmm1, %xmm1
; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT:    retl
entry:
  %double2float.i = fptrunc <4 x double> %0 to <4 x float>
  ret <4 x float> %double2float.i
}

; test_insert_64_zext: mask <0,2> against the constant <0, undef> keeps the
; low i64 of %i and puts zero in the high element. The expected code is a
; single zero-extending movq.
define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
; CHECK-LABEL: test_insert_64_zext:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT:    retl
  %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %1
}

; PR19721: view the vector as an i128 and clear its low 32 bits by and-ing
; with -4294967296. The expected code is one andps against a constant-pool
; mask.
; NOTE(review): the expected label LCPI19_0 appears to embed this function's
; position in the file (it is the 20th definition, i.e. index 19). Adding or
; removing a function above it would then require regenerating the assertions.
define <4 x i32> @PR19721(<4 x i32> %i) {
; CHECK-LABEL: PR19721:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andps LCPI19_0, %xmm0
; CHECK-NEXT:    retl
  %bc = bitcast <4 x i32> %i to i128
  %insert = and i128 %bc, -4294967296
  %bc2 = bitcast i128 %insert to <4 x i32>
  ret <4 x i32> %bc2
}

; test_mul: element-wise <4 x i32> multiply. The expected lowering multiplies
; the even and the odd lanes separately with two pmuludq instructions and
; re-interleaves the low 32-bit results with pshufd/punpckldq.
define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: test_mul:
; CHECK:       ## BB#0:
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm1, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm1
; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT:    retl
  %m = mul <4 x i32> %x, %y
  ret <4 x i32> %m
}