; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64

; t0: llvm.x86.mmx.pslli.q with a shift count that is loaded from %b instead
; of being an immediate. The checks expect the count to be moved into %mm1
; with movd and the register-count form of psllq to be emitted. On i686 the
; i64 result goes through an 8-byte-aligned stack slot into eax/edx; on x86_64
; it is moved directly from %mm0 to %rax.
define i64 @t0(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t0:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    movq (%ecx), %mm0
; X86-NEXT:    movd (%eax), %mm1
; X86-NEXT:    psllq %mm1, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: t0:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    movd (%rsi), %mm1
; X64-NEXT:    psllq %mm1, %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    retq
entry:
  %0 = bitcast <1 x i64>* %a to x86_mmx*
  %1 = load x86_mmx, x86_mmx* %0, align 8
  %2 = load i32, i32* %b, align 4
  %3 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %1, i32 %2)
  %4 = bitcast x86_mmx %3 to i64
  ret i64 %4
}
declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)

; t1: llvm.x86.mmx.psrli.q with a shift count loaded from %b instead of an
; immediate. The checks expect the count to be moved into %mm1 with movd and
; the register-count form of psrlq to be emitted.
define i64 @t1(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t1:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    movq (%ecx), %mm0
; X86-NEXT:    movd (%eax), %mm1
; X86-NEXT:    psrlq %mm1, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: t1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    movd (%rsi), %mm1
; X64-NEXT:    psrlq %mm1, %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    retq
entry:
  %0 = bitcast <1 x i64>* %a to x86_mmx*
  %1 = load x86_mmx, x86_mmx* %0, align 8
  %2 = load i32, i32* %b, align 4
  %3 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %1, i32 %2)
  %4 = bitcast x86_mmx %3 to i64
  ret i64 %4
}
declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32)

; t2: llvm.x86.mmx.pslli.w with a shift count loaded from %b instead of an
; immediate. The checks expect the count to be moved into %mm1 with movd and
; the register-count form of psllw to be emitted.
define i64 @t2(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    movq (%ecx), %mm0
; X86-NEXT:    movd (%eax), %mm1
; X86-NEXT:    psllw %mm1, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: t2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    movd (%rsi), %mm1
; X64-NEXT:    psllw %mm1, %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    retq
entry:
  %0 = bitcast <1 x i64>* %a to x86_mmx*
  %1 = load x86_mmx, x86_mmx* %0, align 8
  %2 = load i32, i32* %b, align 4
  %3 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %1, i32 %2)
  %4 = bitcast x86_mmx %3 to i64
  ret i64 %4
}
declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32)

; t3: llvm.x86.mmx.psrli.w with a shift count loaded from %b instead of an
; immediate. The checks expect the count to be moved into %mm1 with movd and
; the register-count form of psrlw to be emitted.
define i64 @t3(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t3:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    movq (%ecx), %mm0
; X86-NEXT:    movd (%eax), %mm1
; X86-NEXT:    psrlw %mm1, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: t3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    movd (%rsi), %mm1
; X64-NEXT:    psrlw %mm1, %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    retq
entry:
  %0 = bitcast <1 x i64>* %a to x86_mmx*
  %1 = load x86_mmx, x86_mmx* %0, align 8
  %2 = load i32, i32* %b, align 4
  %3 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %1, i32 %2)
  %4 = bitcast x86_mmx %3 to i64
  ret i64 %4
}
declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32)

; t4: llvm.x86.mmx.pslli.d with a shift count loaded from %b instead of an
; immediate. The checks expect the count to be moved into %mm1 with movd and
; the register-count form of pslld to be emitted.
define i64 @t4(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    movq (%ecx), %mm0
; X86-NEXT:    movd (%eax), %mm1
; X86-NEXT:    pslld %mm1, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: t4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    movd (%rsi), %mm1
; X64-NEXT:    pslld %mm1, %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    retq
entry:
  %0 = bitcast <1 x i64>* %a to x86_mmx*
  %1 = load x86_mmx, x86_mmx* %0, align 8
  %2 = load i32, i32* %b, align 4
  %3 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %1, i32 %2)
  %4 = bitcast x86_mmx %3 to i64
  ret i64 %4
}
declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32)

; t5: llvm.x86.mmx.psrli.d with a shift count loaded from %b instead of an
; immediate. The checks expect the count to be moved into %mm1 with movd and
; the register-count form of psrld to be emitted.
define i64 @t5(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t5:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    movq (%ecx), %mm0
; X86-NEXT:    movd (%eax), %mm1
; X86-NEXT:    psrld %mm1, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: t5:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    movd (%rsi), %mm1
; X64-NEXT:    psrld %mm1, %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    retq
entry:
  %0 = bitcast <1 x i64>* %a to x86_mmx*
  %1 = load x86_mmx, x86_mmx* %0, align 8
  %2 = load i32, i32* %b, align 4
  %3 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %1, i32 %2)
  %4 = bitcast x86_mmx %3 to i64
  ret i64 %4
}
declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32)

; t6: llvm.x86.mmx.psrai.w with a shift count loaded from %b instead of an
; immediate. The checks expect the count to be moved into %mm1 with movd and
; the register-count form of psraw to be emitted.
define i64 @t6(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t6:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    movq (%ecx), %mm0
; X86-NEXT:    movd (%eax), %mm1
; X86-NEXT:    psraw %mm1, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: t6:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    movd (%rsi), %mm1
; X64-NEXT:    psraw %mm1, %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    retq
entry:
  %0 = bitcast <1 x i64>* %a to x86_mmx*
  %1 = load x86_mmx, x86_mmx* %0, align 8
  %2 = load i32, i32* %b, align 4
  %3 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %1, i32 %2)
  %4 = bitcast x86_mmx %3 to i64
  ret i64 %4
}
declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32)

; t7: llvm.x86.mmx.psrai.d with a shift count loaded from %b instead of an
; immediate. The checks expect the count to be moved into %mm1 with movd and
; the register-count form of psrad to be emitted.
define i64 @t7(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t7:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    movq (%ecx), %mm0
; X86-NEXT:    movd (%eax), %mm1
; X86-NEXT:    psrad %mm1, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: t7:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    movd (%rsi), %mm1
; X64-NEXT:    psrad %mm1, %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    retq
entry:
  %0 = bitcast <1 x i64>* %a to x86_mmx*
  %1 = load x86_mmx, x86_mmx* %0, align 8
  %2 = load i32, i32* %b, align 4
  %3 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %1, i32 %2)
  %4 = bitcast x86_mmx %3 to i64
  ret i64 %4
}
declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32)

; tt0: llvm.x86.mmx.padd.b whose second operand is loaded from %q. The checks
; expect the load to be folded into the memory-operand form of paddb, and emms
; to be emitted only after the result has been copied out of %mm0.
define i64 @tt0(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt0:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    paddb (%eax), %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    emms
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: tt0:
; X64:       # %bb.0: # %entry
; X64-NEXT:    paddb (%rdi), %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %v = load x86_mmx, x86_mmx* %q
  %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %v)
  %s = bitcast x86_mmx %u to i64
  call void @llvm.x86.mmx.emms()
  ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
declare void @llvm.x86.mmx.emms()

; tt1: llvm.x86.mmx.padd.w whose second operand is loaded from %q. The checks
; expect the load to be folded into the memory-operand form of paddw, and emms
; to be emitted only after the result has been copied out of %mm0.
define i64 @tt1(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt1:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    paddw (%eax), %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    emms
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: tt1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    paddw (%rdi), %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %v = load x86_mmx, x86_mmx* %q
  %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %v)
  %s = bitcast x86_mmx %u to i64
  call void @llvm.x86.mmx.emms()
  ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)

; tt2: llvm.x86.mmx.padd.d whose second operand is loaded from %q. The checks
; expect the load to be folded into the memory-operand form of paddd, and emms
; to be emitted only after the result has been copied out of %mm0.
define i64 @tt2(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    paddd (%eax), %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    emms
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: tt2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    paddd (%rdi), %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %v = load x86_mmx, x86_mmx* %q
  %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %v)
  %s = bitcast x86_mmx %u to i64
  call void @llvm.x86.mmx.emms()
  ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)

; tt3: llvm.x86.mmx.padd.q whose second operand is loaded from %q. The checks
; expect the load to be folded into the memory-operand form of paddq, and emms
; to be emitted only after the result has been copied out of %mm0.
define i64 @tt3(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt3:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    paddq (%eax), %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    emms
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: tt3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    paddq (%rdi), %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %v = load x86_mmx, x86_mmx* %q
  %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %v)
  %s = bitcast x86_mmx %u to i64
  call void @llvm.x86.mmx.emms()
  ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)

; tt4: llvm.x86.mmx.paddus.b whose second operand is loaded from %q. The
; checks expect the load to be folded into the memory-operand form of paddusb,
; and emms to be emitted only after the result has been copied out of %mm0.
define i64 @tt4(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    paddusb (%eax), %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    emms
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: tt4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    paddusb (%rdi), %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %v = load x86_mmx, x86_mmx* %q
  %u = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %t, x86_mmx %v)
  %s = bitcast x86_mmx %u to i64
  call void @llvm.x86.mmx.emms()
  ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)

; tt5: llvm.x86.mmx.paddus.w whose second operand is loaded from %q. The
; checks expect the load to be folded into the memory-operand form of paddusw,
; and emms to be emitted only after the result has been copied out of %mm0.
define i64 @tt5(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt5:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    paddusw (%eax), %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    emms
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: tt5:
; X64:       # %bb.0: # %entry
; X64-NEXT:    paddusw (%rdi), %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %v = load x86_mmx, x86_mmx* %q
  %u = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %t, x86_mmx %v)
  %s = bitcast x86_mmx %u to i64
  call void @llvm.x86.mmx.emms()
  ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)

; tt6: llvm.x86.mmx.psrl.w whose x86_mmx shift-count operand is loaded from
; %q. The checks expect the load to be folded into the memory-operand form of
; psrlw, and emms to be emitted only after the result has been copied out of
; %mm0.
define i64 @tt6(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt6:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    psrlw (%eax), %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    emms
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: tt6:
; X64:       # %bb.0: # %entry
; X64-NEXT:    psrlw (%rdi), %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %v = load x86_mmx, x86_mmx* %q
  %u = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %t, x86_mmx %v)
  %s = bitcast x86_mmx %u to i64
  call void @llvm.x86.mmx.emms()
  ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx)

; tt7: llvm.x86.mmx.psrl.d whose x86_mmx shift-count operand is loaded from
; %q. The checks expect the load to be folded into the memory-operand form of
; psrld, and emms to be emitted only after the result has been copied out of
; %mm0.
define i64 @tt7(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt7:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    psrld (%eax), %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    emms
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: tt7:
; X64:       # %bb.0: # %entry
; X64-NEXT:    psrld (%rdi), %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %v = load x86_mmx, x86_mmx* %q
  %u = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %t, x86_mmx %v)
  %s = bitcast x86_mmx %u to i64
  call void @llvm.x86.mmx.emms()
  ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx)

; tt8: llvm.x86.mmx.psrl.q whose x86_mmx shift-count operand is loaded from
; %q. The checks expect the load to be folded into the memory-operand form of
; psrlq, and emms to be emitted only after the result has been copied out of
; %mm0.
define i64 @tt8(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    psrlq (%eax), %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    emms
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: tt8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    psrlq (%rdi), %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %v = load x86_mmx, x86_mmx* %q
  %u = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %t, x86_mmx %v)
  %s = bitcast x86_mmx %u to i64
  call void @llvm.x86.mmx.emms()
  ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx)

; llvm.x86.mmx.psrli.q applied to the constant <1 x i64> <i64 255> with a
; shift count that is written to and read back from a stack slot through
; volatile accesses. The checks expect the store of 1 to the slot to remain,
; the count to be reloaded from that slot into %mm1 with movd, the constant to
; be materialized through a general-purpose register, and the register-count
; form of psrlq to be emitted before the result is stored through %t.
define void @test_psrlq_by_volatile_shift_amount(x86_mmx* %t) nounwind {
; X86-LABEL: test_psrlq_by_volatile_shift_amount:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl $1, (%esp)
; X86-NEXT:    movl $255, %ecx
; X86-NEXT:    movd %ecx, %mm0
; X86-NEXT:    movd (%esp), %mm1
; X86-NEXT:    psrlq %mm1, %mm0
; X86-NEXT:    movq %mm0, (%eax)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
;
; X64-LABEL: test_psrlq_by_volatile_shift_amount:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl $1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl $255, %eax
; X64-NEXT:    movd %eax, %mm0
; X64-NEXT:    movd -{{[0-9]+}}(%rsp), %mm1
; X64-NEXT:    psrlq %mm1, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
entry:
  %0 = alloca i32, align 4
  %1 = bitcast i32* %0 to i8*
  call void @llvm.lifetime.start(i64 4, i8* nonnull %1)
  store volatile i32 1, i32* %0, align 4
  %2 = load volatile i32, i32* %0, align 4
  %3 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx bitcast (<1 x i64> <i64 255> to x86_mmx), i32 %2)
  store x86_mmx %3, x86_mmx* %t, align 8
  call void @llvm.lifetime.end(i64 4, i8* nonnull %1)
  ret void
}

declare void @llvm.lifetime.start(i64, i8* nocapture)
declare void @llvm.lifetime.end(i64, i8* nocapture)

; Make sure we shrink this vector load and fold it.
; Element 0 of a loaded <4 x float> is placed in both lanes of a <2 x float>,
; which is bitcast to x86_mmx and passed as both operands of
; llvm.x86.mmx.padds.b. The checks expect a single pshufw $68 that reads its
; source directly from memory, followed by paddsb. On x86_64 the result is
; additionally moved to %xmm0 with movq2dq before returning.
define x86_mmx @vec_load(<4 x float>* %x) {
; X86-LABEL: vec_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    pshufw $68, (%eax), %mm0 # mm0 = mem[0,1,0,1]
; X86-NEXT:    paddsb %mm0, %mm0
; X86-NEXT:    retl
;
; X64-LABEL: vec_load:
; X64:       # %bb.0:
; X64-NEXT:    pshufw $68, (%rdi), %mm0 # mm0 = mem[0,1,0,1]
; X64-NEXT:    paddsb %mm0, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    retq
  %z = load <4 x float>, <4 x float>* %x
  %y = extractelement <4 x float> %z, i32 0
  %a = insertelement <2 x float> undef, float %y, i32 0
  %b = insertelement <2 x float> %a, float %y, i32 1
  %c = bitcast <2 x float> %b to x86_mmx
  %d = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %c, x86_mmx %c)
  ret x86_mmx %d
}

declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
