; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41

; Test @a - multiply a <4 x i32> vector by the splat constant 117.
; Without SSE4.1 the checks expect the multiply to be expanded into two
; pmuludq instructions (one on the original lanes, one on the odd lanes moved
; down by pshufd) whose low halves are re-interleaved with pshufd/punpckldq.
; With SSE4.1 the checks expect a single pmulld taking the constant as a
; RIP-relative memory operand.
define <4 x i32> @a(<4 x i32> %i) nounwind  {
; SSE2-LABEL: a:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: a:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
entry:
  %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
  ret <4 x i32> %A
}

; Test @b - multiply a <2 x i64> vector by the splat constant 117.
; The expected code is shared by both llc invocations (common ALL prefix): the
; 64-bit multiply is built from three pmuludq partial products that are shifted
; with psllq $32 and summed with paddq.
; NOTE(review): one of the expected partial products multiplies by a register
; that was just zeroed with pxor - presumably the all-zero upper 32 bits of the
; constant; confirm whether that is intended before regenerating the checks.
define <2 x i64> @b(<2 x i64> %i) nounwind  {
; ALL-LABEL: b:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    movdqa {{.*#+}} xmm1 = [117,117]
; ALL-NEXT:    movdqa %xmm0, %xmm2
; ALL-NEXT:    pmuludq %xmm1, %xmm2
; ALL-NEXT:    pxor %xmm3, %xmm3
; ALL-NEXT:    pmuludq %xmm0, %xmm3
; ALL-NEXT:    psllq $32, %xmm3
; ALL-NEXT:    paddq %xmm3, %xmm2
; ALL-NEXT:    psrlq $32, %xmm0
; ALL-NEXT:    pmuludq %xmm1, %xmm0
; ALL-NEXT:    psllq $32, %xmm0
; ALL-NEXT:    paddq %xmm2, %xmm0
; ALL-NEXT:    retq
entry:
  %A = mul <2 x i64> %i, < i64 117, i64 117 >
  ret <2 x i64> %A
}

; Test @c - multiply two <4 x i32> vector arguments.
; Without SSE4.1 the checks expect two pmuludq instructions (original lanes and
; the odd lanes of both operands moved down by pshufd) whose low halves are
; re-interleaved with pshufd/punpckldq. With SSE4.1 the checks expect a single
; register-register pmulld.
define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind  {
; SSE2-LABEL: c:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: c:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    retq
entry:
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

; Test @d - multiply two <2 x i64> vector arguments.
; The expected code is shared by both llc invocations (common ALL prefix): three
; pmuludq partial products, where each of the two products involving an upper
; 32-bit half (obtained with psrlq $32) is shifted back with psllq $32 before
; everything is summed with paddq.
define <2 x i64> @d(<2 x i64> %i, <2 x i64> %j) nounwind  {
; ALL-LABEL: d:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    movdqa %xmm0, %xmm2
; ALL-NEXT:    pmuludq %xmm1, %xmm2
; ALL-NEXT:    movdqa %xmm1, %xmm3
; ALL-NEXT:    psrlq $32, %xmm3
; ALL-NEXT:    pmuludq %xmm0, %xmm3
; ALL-NEXT:    psllq $32, %xmm3
; ALL-NEXT:    paddq %xmm3, %xmm2
; ALL-NEXT:    psrlq $32, %xmm0
; ALL-NEXT:    pmuludq %xmm1, %xmm0
; ALL-NEXT:    psllq $32, %xmm0
; ALL-NEXT:    paddq %xmm2, %xmm0
; ALL-NEXT:    retq
entry:
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

; External function with no body in this file; @e and @f call it so that their
; vector arguments have to be spilled around the call.
declare void @foo()

; Test @e - multiply two <4 x i32> vector arguments after a call to @foo.
; Both operands are expected to be spilled to the stack before the call and
; brought back afterwards. Without SSE4.1 the checks expect both values to be
; reloaded into registers and multiplied with the pmuludq/pshufd/punpckldq
; sequence; with SSE4.1 they expect one operand to be reloaded and the other to
; be folded into pmulld as a stack memory operand.
define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind  {
; SSE2-LABEL: e:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    subq $40, %rsp
; SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    callq foo
; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    addq $40, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: e:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    subq $40, %rsp
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE41-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE41-NEXT:    callq foo
; SSE41-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE41-NEXT:    pmulld {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; SSE41-NEXT:    addq $40, %rsp
; SSE41-NEXT:    retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

; Test @f - multiply two <2 x i64> vector arguments after a call to @foo.
; The expected code is shared by both llc invocations (common ALL prefix): both
; operands are spilled before the call, reloaded into registers afterwards, and
; multiplied with three pmuludq partial products combined via psllq $32 and
; paddq.
define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind  {
; ALL-LABEL: f:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    subq $40, %rsp
; ALL-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; ALL-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; ALL-NEXT:    callq foo
; ALL-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; ALL-NEXT:    movdqa %xmm0, %xmm2
; ALL-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; ALL-NEXT:    pmuludq %xmm3, %xmm2
; ALL-NEXT:    movdqa %xmm3, %xmm1
; ALL-NEXT:    psrlq $32, %xmm1
; ALL-NEXT:    pmuludq %xmm0, %xmm1
; ALL-NEXT:    psllq $32, %xmm1
; ALL-NEXT:    paddq %xmm1, %xmm2
; ALL-NEXT:    psrlq $32, %xmm0
; ALL-NEXT:    pmuludq %xmm3, %xmm0
; ALL-NEXT:    psllq $32, %xmm0
; ALL-NEXT:    paddq %xmm2, %xmm0
; ALL-NEXT:    addq $40, %rsp
; ALL-NEXT:    retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}
