1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32
3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
4
5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
6
7define <2 x i64> @test_mm_add_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
8; X32-LABEL: test_mm_add_epi8:
9; X32:       # BB#0:
10; X32-NEXT:    paddb %xmm1, %xmm0
11; X32-NEXT:    retl
12;
13; X64-LABEL: test_mm_add_epi8:
14; X64:       # BB#0:
15; X64-NEXT:    paddb %xmm1, %xmm0
16; X64-NEXT:    retq
17  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
18  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
19  %res = add <16 x i8> %arg0, %arg1
20  %bc = bitcast <16 x i8> %res to <2 x i64>
21  ret <2 x i64> %bc
22}
23
24define <2 x i64> @test_mm_add_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
25; X32-LABEL: test_mm_add_epi16:
26; X32:       # BB#0:
27; X32-NEXT:    paddw %xmm1, %xmm0
28; X32-NEXT:    retl
29;
30; X64-LABEL: test_mm_add_epi16:
31; X64:       # BB#0:
32; X64-NEXT:    paddw %xmm1, %xmm0
33; X64-NEXT:    retq
34  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
35  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
36  %res = add <8 x i16> %arg0, %arg1
37  %bc = bitcast <8 x i16> %res to <2 x i64>
38  ret <2 x i64> %bc
39}
40
41define <2 x i64> @test_mm_add_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
42; X32-LABEL: test_mm_add_epi32:
43; X32:       # BB#0:
44; X32-NEXT:    paddd %xmm1, %xmm0
45; X32-NEXT:    retl
46;
47; X64-LABEL: test_mm_add_epi32:
48; X64:       # BB#0:
49; X64-NEXT:    paddd %xmm1, %xmm0
50; X64-NEXT:    retq
51  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
52  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
53  %res = add <4 x i32> %arg0, %arg1
54  %bc = bitcast <4 x i32> %res to <2 x i64>
55  ret <2 x i64> %bc
56}
57
58define <2 x i64> @test_mm_add_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
59; X32-LABEL: test_mm_add_epi64:
60; X32:       # BB#0:
61; X32-NEXT:    paddq %xmm1, %xmm0
62; X32-NEXT:    retl
63;
64; X64-LABEL: test_mm_add_epi64:
65; X64:       # BB#0:
66; X64-NEXT:    paddq %xmm1, %xmm0
67; X64-NEXT:    retq
68  %res = add <2 x i64> %a0, %a1
69  ret <2 x i64> %res
70}
71
72define <2 x double> @test_mm_add_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
73; X32-LABEL: test_mm_add_pd:
74; X32:       # BB#0:
75; X32-NEXT:    addpd %xmm1, %xmm0
76; X32-NEXT:    retl
77;
78; X64-LABEL: test_mm_add_pd:
79; X64:       # BB#0:
80; X64-NEXT:    addpd %xmm1, %xmm0
81; X64-NEXT:    retq
82  %res = fadd <2 x double> %a0, %a1
83  ret <2 x double> %res
84}
85
86define <2 x double> @test_mm_add_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
87; X32-LABEL: test_mm_add_sd:
88; X32:       # BB#0:
89; X32-NEXT:    addsd %xmm1, %xmm0
90; X32-NEXT:    retl
91;
92; X64-LABEL: test_mm_add_sd:
93; X64:       # BB#0:
94; X64-NEXT:    addsd %xmm1, %xmm0
95; X64-NEXT:    retq
96  %ext0 = extractelement <2 x double> %a0, i32 0
97  %ext1 = extractelement <2 x double> %a1, i32 0
98  %fadd = fadd double %ext0, %ext1
99  %res = insertelement <2 x double> %a0, double %fadd, i32 0
100  ret <2 x double> %res
101}
102
103define <2 x i64> @test_mm_adds_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
104; X32-LABEL: test_mm_adds_epi8:
105; X32:       # BB#0:
106; X32-NEXT:    paddsb %xmm1, %xmm0
107; X32-NEXT:    retl
108;
109; X64-LABEL: test_mm_adds_epi8:
110; X64:       # BB#0:
111; X64-NEXT:    paddsb %xmm1, %xmm0
112; X64-NEXT:    retq
113  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
114  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
115  %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %arg0, <16 x i8> %arg1)
116  %bc = bitcast <16 x i8> %res to <2 x i64>
117  ret <2 x i64> %bc
118}
119declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
120
121define <2 x i64> @test_mm_adds_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
122; X32-LABEL: test_mm_adds_epi16:
123; X32:       # BB#0:
124; X32-NEXT:    paddsw %xmm1, %xmm0
125; X32-NEXT:    retl
126;
127; X64-LABEL: test_mm_adds_epi16:
128; X64:       # BB#0:
129; X64-NEXT:    paddsw %xmm1, %xmm0
130; X64-NEXT:    retq
131  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
132  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
133  %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %arg0, <8 x i16> %arg1)
134  %bc = bitcast <8 x i16> %res to <2 x i64>
135  ret <2 x i64> %bc
136}
137declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
138
139define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
140; X32-LABEL: test_mm_adds_epu8:
141; X32:       # BB#0:
142; X32-NEXT:    paddusb %xmm1, %xmm0
143; X32-NEXT:    retl
144;
145; X64-LABEL: test_mm_adds_epu8:
146; X64:       # BB#0:
147; X64-NEXT:    paddusb %xmm1, %xmm0
148; X64-NEXT:    retq
149  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
150  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
151  %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %arg0, <16 x i8> %arg1)
152  %bc = bitcast <16 x i8> %res to <2 x i64>
153  ret <2 x i64> %bc
154}
155declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
156
157define <2 x i64> @test_mm_adds_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
158; X32-LABEL: test_mm_adds_epu16:
159; X32:       # BB#0:
160; X32-NEXT:    paddusw %xmm1, %xmm0
161; X32-NEXT:    retl
162;
163; X64-LABEL: test_mm_adds_epu16:
164; X64:       # BB#0:
165; X64-NEXT:    paddusw %xmm1, %xmm0
166; X64-NEXT:    retq
167  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
168  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
169  %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %arg0, <8 x i16> %arg1)
170  %bc = bitcast <8 x i16> %res to <2 x i64>
171  ret <2 x i64> %bc
172}
173declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
174
175define <2 x double> @test_mm_and_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
176; X32-LABEL: test_mm_and_pd:
177; X32:       # BB#0:
178; X32-NEXT:    andps %xmm1, %xmm0
179; X32-NEXT:    retl
180;
181; X64-LABEL: test_mm_and_pd:
182; X64:       # BB#0:
183; X64-NEXT:    andps %xmm1, %xmm0
184; X64-NEXT:    retq
185  %arg0 = bitcast <2 x double> %a0 to <4 x i32>
186  %arg1 = bitcast <2 x double> %a1 to <4 x i32>
187  %res = and <4 x i32> %arg0, %arg1
188  %bc = bitcast <4 x i32> %res to <2 x double>
189  ret <2 x double> %bc
190}
191
192define <2 x i64> @test_mm_and_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
193; X32-LABEL: test_mm_and_si128:
194; X32:       # BB#0:
195; X32-NEXT:    andps %xmm1, %xmm0
196; X32-NEXT:    retl
197;
198; X64-LABEL: test_mm_and_si128:
199; X64:       # BB#0:
200; X64-NEXT:    andps %xmm1, %xmm0
201; X64-NEXT:    retq
202  %res = and <2 x i64> %a0, %a1
203  ret <2 x i64> %res
204}
205
206define <2 x double> @test_mm_andnot_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
207; X32-LABEL: test_mm_andnot_pd:
208; X32:       # BB#0:
209; X32-NEXT:    andnps %xmm1, %xmm0
210; X32-NEXT:    retl
211;
212; X64-LABEL: test_mm_andnot_pd:
213; X64:       # BB#0:
214; X64-NEXT:    andnps %xmm1, %xmm0
215; X64-NEXT:    retq
216  %arg0 = bitcast <2 x double> %a0 to <4 x i32>
217  %arg1 = bitcast <2 x double> %a1 to <4 x i32>
218  %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
219  %res = and <4 x i32> %not, %arg1
220  %bc = bitcast <4 x i32> %res to <2 x double>
221  ret <2 x double> %bc
222}
223
224define <2 x i64> @test_mm_andnot_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
225; X32-LABEL: test_mm_andnot_si128:
226; X32:       # BB#0:
227; X32-NEXT:    pcmpeqd %xmm2, %xmm2
228; X32-NEXT:    pxor %xmm2, %xmm0
229; X32-NEXT:    pand %xmm1, %xmm0
230; X32-NEXT:    retl
231;
232; X64-LABEL: test_mm_andnot_si128:
233; X64:       # BB#0:
234; X64-NEXT:    pcmpeqd %xmm2, %xmm2
235; X64-NEXT:    pxor %xmm2, %xmm0
236; X64-NEXT:    pand %xmm1, %xmm0
237; X64-NEXT:    retq
238  %not = xor <2 x i64> %a0, <i64 -1, i64 -1>
239  %res = and <2 x i64> %not, %a1
240  ret <2 x i64> %res
241}
242
243define <2 x i64> @test_mm_avg_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
244; X32-LABEL: test_mm_avg_epu8:
245; X32:       # BB#0:
246; X32-NEXT:    pavgb %xmm1, %xmm0
247; X32-NEXT:    retl
248;
249; X64-LABEL: test_mm_avg_epu8:
250; X64:       # BB#0:
251; X64-NEXT:    pavgb %xmm1, %xmm0
252; X64-NEXT:    retq
253  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
254  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
255  %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1)
256  %bc = bitcast <16 x i8> %res to <2 x i64>
257  ret <2 x i64> %bc
258}
259declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone
260
261define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
262; X32-LABEL: test_mm_avg_epu16:
263; X32:       # BB#0:
264; X32-NEXT:    pavgw %xmm1, %xmm0
265; X32-NEXT:    retl
266;
267; X64-LABEL: test_mm_avg_epu16:
268; X64:       # BB#0:
269; X64-NEXT:    pavgw %xmm1, %xmm0
270; X64-NEXT:    retq
271  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
272  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
273  %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %arg0, <8 x i16> %arg1)
274  %bc = bitcast <8 x i16> %res to <2 x i64>
275  ret <2 x i64> %bc
276}
277declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
278
279define <2 x i64> @test_mm_bslli_si128(<2 x i64> %a0) nounwind {
280; X32-LABEL: test_mm_bslli_si128:
281; X32:       # BB#0:
282; X32-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
283; X32-NEXT:    retl
284;
285; X64-LABEL: test_mm_bslli_si128:
286; X64:       # BB#0:
287; X64-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
288; X64-NEXT:    retq
289  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
290  %res = shufflevector <16 x i8> zeroinitializer, <16 x i8> %arg0, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
291  %bc = bitcast <16 x i8> %res to <2 x i64>
292  ret <2 x i64> %bc
293}
294
295define <2 x i64> @test_mm_bsrli_si128(<2 x i64> %a0) nounwind {
296; X32-LABEL: test_mm_bsrli_si128:
297; X32:       # BB#0:
298; X32-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
299; X32-NEXT:    retl
300;
301; X64-LABEL: test_mm_bsrli_si128:
302; X64:       # BB#0:
303; X64-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
304; X64-NEXT:    retq
305  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
306  %res = shufflevector <16 x i8> %arg0, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
307  %bc = bitcast <16 x i8> %res to <2 x i64>
308  ret <2 x i64> %bc
309}
310
311define <4 x float> @test_mm_castpd_ps(<2 x double> %a0) nounwind {
312; X32-LABEL: test_mm_castpd_ps:
313; X32:       # BB#0:
314; X32-NEXT:    retl
315;
316; X64-LABEL: test_mm_castpd_ps:
317; X64:       # BB#0:
318; X64-NEXT:    retq
319  %res = bitcast <2 x double> %a0 to <4 x float>
320  ret <4 x float> %res
321}
322
323define <2 x i64> @test_mm_castpd_si128(<2 x double> %a0) nounwind {
324; X32-LABEL: test_mm_castpd_si128:
325; X32:       # BB#0:
326; X32-NEXT:    retl
327;
328; X64-LABEL: test_mm_castpd_si128:
329; X64:       # BB#0:
330; X64-NEXT:    retq
331  %res = bitcast <2 x double> %a0 to <2 x i64>
332  ret <2 x i64> %res
333}
334
335define <2 x double> @test_mm_castps_pd(<4 x float> %a0) nounwind {
336; X32-LABEL: test_mm_castps_pd:
337; X32:       # BB#0:
338; X32-NEXT:    retl
339;
340; X64-LABEL: test_mm_castps_pd:
341; X64:       # BB#0:
342; X64-NEXT:    retq
343  %res = bitcast <4 x float> %a0 to <2 x double>
344  ret <2 x double> %res
345}
346
347define <2 x i64> @test_mm_castps_si128(<4 x float> %a0) nounwind {
348; X32-LABEL: test_mm_castps_si128:
349; X32:       # BB#0:
350; X32-NEXT:    retl
351;
352; X64-LABEL: test_mm_castps_si128:
353; X64:       # BB#0:
354; X64-NEXT:    retq
355  %res = bitcast <4 x float> %a0 to <2 x i64>
356  ret <2 x i64> %res
357}
358
359define <2 x double> @test_mm_castsi128_pd(<2 x i64> %a0) nounwind {
360; X32-LABEL: test_mm_castsi128_pd:
361; X32:       # BB#0:
362; X32-NEXT:    retl
363;
364; X64-LABEL: test_mm_castsi128_pd:
365; X64:       # BB#0:
366; X64-NEXT:    retq
367  %res = bitcast <2 x i64> %a0 to <2 x double>
368  ret <2 x double> %res
369}
370
371define <4 x float> @test_mm_castsi128_ps(<2 x i64> %a0) nounwind {
372; X32-LABEL: test_mm_castsi128_ps:
373; X32:       # BB#0:
374; X32-NEXT:    retl
375;
376; X64-LABEL: test_mm_castsi128_ps:
377; X64:       # BB#0:
378; X64-NEXT:    retq
379  %res = bitcast <2 x i64> %a0 to <4 x float>
380  ret <4 x float> %res
381}
382
383define void @test_mm_clflush(i8* %a0) nounwind {
384; X32-LABEL: test_mm_clflush:
385; X32:       # BB#0:
386; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
387; X32-NEXT:    clflush (%eax)
388; X32-NEXT:    retl
389;
390; X64-LABEL: test_mm_clflush:
391; X64:       # BB#0:
392; X64-NEXT:    clflush (%rdi)
393; X64-NEXT:    retq
394  call void @llvm.x86.sse2.clflush(i8* %a0)
395  ret void
396}
397declare void @llvm.x86.sse2.clflush(i8*) nounwind readnone
398
399define <2 x i64> @test_mm_cmpeq_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
400; X32-LABEL: test_mm_cmpeq_epi8:
401; X32:       # BB#0:
402; X32-NEXT:    pcmpeqb %xmm1, %xmm0
403; X32-NEXT:    retl
404;
405; X64-LABEL: test_mm_cmpeq_epi8:
406; X64:       # BB#0:
407; X64-NEXT:    pcmpeqb %xmm1, %xmm0
408; X64-NEXT:    retq
409  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
410  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
411  %cmp = icmp eq <16 x i8> %arg0, %arg1
412  %res = sext <16 x i1> %cmp to <16 x i8>
413  %bc = bitcast <16 x i8> %res to <2 x i64>
414  ret <2 x i64> %bc
415}
416
417define <2 x i64> @test_mm_cmpeq_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
418; X32-LABEL: test_mm_cmpeq_epi16:
419; X32:       # BB#0:
420; X32-NEXT:    pcmpeqw %xmm1, %xmm0
421; X32-NEXT:    retl
422;
423; X64-LABEL: test_mm_cmpeq_epi16:
424; X64:       # BB#0:
425; X64-NEXT:    pcmpeqw %xmm1, %xmm0
426; X64-NEXT:    retq
427  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
428  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
429  %cmp = icmp eq <8 x i16> %arg0, %arg1
430  %res = sext <8 x i1> %cmp to <8 x i16>
431  %bc = bitcast <8 x i16> %res to <2 x i64>
432  ret <2 x i64> %bc
433}
434
435define <2 x i64> @test_mm_cmpeq_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
436; X32-LABEL: test_mm_cmpeq_epi32:
437; X32:       # BB#0:
438; X32-NEXT:    pcmpeqd %xmm1, %xmm0
439; X32-NEXT:    retl
440;
441; X64-LABEL: test_mm_cmpeq_epi32:
442; X64:       # BB#0:
443; X64-NEXT:    pcmpeqd %xmm1, %xmm0
444; X64-NEXT:    retq
445  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
446  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
447  %cmp = icmp eq <4 x i32> %arg0, %arg1
448  %res = sext <4 x i1> %cmp to <4 x i32>
449  %bc = bitcast <4 x i32> %res to <2 x i64>
450  ret <2 x i64> %bc
451}
452
453define <2 x double> @test_mm_cmpeq_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
454; X32-LABEL: test_mm_cmpeq_pd:
455; X32:       # BB#0:
456; X32-NEXT:    cmpeqpd %xmm1, %xmm0
457; X32-NEXT:    retl
458;
459; X64-LABEL: test_mm_cmpeq_pd:
460; X64:       # BB#0:
461; X64-NEXT:    cmpeqpd %xmm1, %xmm0
462; X64-NEXT:    retq
463  %fcmp = fcmp oeq <2 x double> %a0, %a1
464  %sext = sext <2 x i1> %fcmp to <2 x i64>
465  %res = bitcast <2 x i64> %sext to <2 x double>
466  ret <2 x double> %res
467}
468
469define <2 x double> @test_mm_cmpeq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
470; X32-LABEL: test_mm_cmpeq_sd:
471; X32:       # BB#0:
472; X32-NEXT:    cmpeqsd %xmm1, %xmm0
473; X32-NEXT:    retl
474;
475; X64-LABEL: test_mm_cmpeq_sd:
476; X64:       # BB#0:
477; X64-NEXT:    cmpeqsd %xmm1, %xmm0
478; X64-NEXT:    retq
479  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
480  ret <2 x double> %res
481}
482declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
483
484define <2 x double> @test_mm_cmpge_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
485; X32-LABEL: test_mm_cmpge_pd:
486; X32:       # BB#0:
487; X32-NEXT:    cmplepd %xmm0, %xmm1
488; X32-NEXT:    movapd %xmm1, %xmm0
489; X32-NEXT:    retl
490;
491; X64-LABEL: test_mm_cmpge_pd:
492; X64:       # BB#0:
493; X64-NEXT:    cmplepd %xmm0, %xmm1
494; X64-NEXT:    movapd %xmm1, %xmm0
495; X64-NEXT:    retq
496  %fcmp = fcmp ole <2 x double> %a1, %a0
497  %sext = sext <2 x i1> %fcmp to <2 x i64>
498  %res = bitcast <2 x i64> %sext to <2 x double>
499  ret <2 x double> %res
500}
501
502define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
503; X32-LABEL: test_mm_cmpge_sd:
504; X32:       # BB#0:
505; X32-NEXT:    cmplesd %xmm0, %xmm1
506; X32-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
507; X32-NEXT:    retl
508;
509; X64-LABEL: test_mm_cmpge_sd:
510; X64:       # BB#0:
511; X64-NEXT:    cmplesd %xmm0, %xmm1
512; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
513; X64-NEXT:    retq
514  %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 2)
515  %ext0 = extractelement <2 x double> %cmp, i32 0
516  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
517  %ext1 = extractelement <2 x double> %a0, i32 1
518  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
519  ret <2 x double> %ins1
520}
521
522define <2 x i64> @test_mm_cmpgt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
523; X32-LABEL: test_mm_cmpgt_epi8:
524; X32:       # BB#0:
525; X32-NEXT:    pcmpgtb %xmm1, %xmm0
526; X32-NEXT:    retl
527;
528; X64-LABEL: test_mm_cmpgt_epi8:
529; X64:       # BB#0:
530; X64-NEXT:    pcmpgtb %xmm1, %xmm0
531; X64-NEXT:    retq
532  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
533  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
534  %cmp = icmp sgt <16 x i8> %arg0, %arg1
535  %res = sext <16 x i1> %cmp to <16 x i8>
536  %bc = bitcast <16 x i8> %res to <2 x i64>
537  ret <2 x i64> %bc
538}
539
540define <2 x i64> @test_mm_cmpgt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
541; X32-LABEL: test_mm_cmpgt_epi16:
542; X32:       # BB#0:
543; X32-NEXT:    pcmpgtw %xmm1, %xmm0
544; X32-NEXT:    retl
545;
546; X64-LABEL: test_mm_cmpgt_epi16:
547; X64:       # BB#0:
548; X64-NEXT:    pcmpgtw %xmm1, %xmm0
549; X64-NEXT:    retq
550  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
551  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
552  %cmp = icmp sgt <8 x i16> %arg0, %arg1
553  %res = sext <8 x i1> %cmp to <8 x i16>
554  %bc = bitcast <8 x i16> %res to <2 x i64>
555  ret <2 x i64> %bc
556}
557
558define <2 x i64> @test_mm_cmpgt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
559; X32-LABEL: test_mm_cmpgt_epi32:
560; X32:       # BB#0:
561; X32-NEXT:    pcmpgtd %xmm1, %xmm0
562; X32-NEXT:    retl
563;
564; X64-LABEL: test_mm_cmpgt_epi32:
565; X64:       # BB#0:
566; X64-NEXT:    pcmpgtd %xmm1, %xmm0
567; X64-NEXT:    retq
568  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
569  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
570  %cmp = icmp sgt <4 x i32> %arg0, %arg1
571  %res = sext <4 x i1> %cmp to <4 x i32>
572  %bc = bitcast <4 x i32> %res to <2 x i64>
573  ret <2 x i64> %bc
574}
575
576define <2 x double> @test_mm_cmpgt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
577; X32-LABEL: test_mm_cmpgt_pd:
578; X32:       # BB#0:
579; X32-NEXT:    cmpltpd %xmm0, %xmm1
580; X32-NEXT:    movapd %xmm1, %xmm0
581; X32-NEXT:    retl
582;
583; X64-LABEL: test_mm_cmpgt_pd:
584; X64:       # BB#0:
585; X64-NEXT:    cmpltpd %xmm0, %xmm1
586; X64-NEXT:    movapd %xmm1, %xmm0
587; X64-NEXT:    retq
588  %fcmp = fcmp olt <2 x double> %a1, %a0
589  %sext = sext <2 x i1> %fcmp to <2 x i64>
590  %res = bitcast <2 x i64> %sext to <2 x double>
591  ret <2 x double> %res
592}
593
594define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
595; X32-LABEL: test_mm_cmpgt_sd:
596; X32:       # BB#0:
597; X32-NEXT:    cmpltsd %xmm0, %xmm1
598; X32-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
599; X32-NEXT:    retl
600;
601; X64-LABEL: test_mm_cmpgt_sd:
602; X64:       # BB#0:
603; X64-NEXT:    cmpltsd %xmm0, %xmm1
604; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
605; X64-NEXT:    retq
606  %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 1)
607  %ext0 = extractelement <2 x double> %cmp, i32 0
608  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
609  %ext1 = extractelement <2 x double> %a0, i32 1
610  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
611  ret <2 x double> %ins1
612}
613
614define <2 x double> @test_mm_cmple_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
615; X32-LABEL: test_mm_cmple_pd:
616; X32:       # BB#0:
617; X32-NEXT:    cmplepd %xmm1, %xmm0
618; X32-NEXT:    retl
619;
620; X64-LABEL: test_mm_cmple_pd:
621; X64:       # BB#0:
622; X64-NEXT:    cmplepd %xmm1, %xmm0
623; X64-NEXT:    retq
624  %fcmp = fcmp ole <2 x double> %a0, %a1
625  %sext = sext <2 x i1> %fcmp to <2 x i64>
626  %res = bitcast <2 x i64> %sext to <2 x double>
627  ret <2 x double> %res
628}
629
630define <2 x double> @test_mm_cmple_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
631; X32-LABEL: test_mm_cmple_sd:
632; X32:       # BB#0:
633; X32-NEXT:    cmplesd %xmm1, %xmm0
634; X32-NEXT:    retl
635;
636; X64-LABEL: test_mm_cmple_sd:
637; X64:       # BB#0:
638; X64-NEXT:    cmplesd %xmm1, %xmm0
639; X64-NEXT:    retq
640  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 2)
641  ret <2 x double> %res
642}
643
644define <2 x i64> @test_mm_cmplt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
645; X32-LABEL: test_mm_cmplt_epi8:
646; X32:       # BB#0:
647; X32-NEXT:    pcmpgtb %xmm0, %xmm1
648; X32-NEXT:    movdqa %xmm1, %xmm0
649; X32-NEXT:    retl
650;
651; X64-LABEL: test_mm_cmplt_epi8:
652; X64:       # BB#0:
653; X64-NEXT:    pcmpgtb %xmm0, %xmm1
654; X64-NEXT:    movdqa %xmm1, %xmm0
655; X64-NEXT:    retq
656  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
657  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
658  %cmp = icmp sgt <16 x i8> %arg1, %arg0
659  %res = sext <16 x i1> %cmp to <16 x i8>
660  %bc = bitcast <16 x i8> %res to <2 x i64>
661  ret <2 x i64> %bc
662}
663
664define <2 x i64> @test_mm_cmplt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
665; X32-LABEL: test_mm_cmplt_epi16:
666; X32:       # BB#0:
667; X32-NEXT:    pcmpgtw %xmm0, %xmm1
668; X32-NEXT:    movdqa %xmm1, %xmm0
669; X32-NEXT:    retl
670;
671; X64-LABEL: test_mm_cmplt_epi16:
672; X64:       # BB#0:
673; X64-NEXT:    pcmpgtw %xmm0, %xmm1
674; X64-NEXT:    movdqa %xmm1, %xmm0
675; X64-NEXT:    retq
676  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
677  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
678  %cmp = icmp sgt <8 x i16> %arg1, %arg0
679  %res = sext <8 x i1> %cmp to <8 x i16>
680  %bc = bitcast <8 x i16> %res to <2 x i64>
681  ret <2 x i64> %bc
682}
683
684define <2 x i64> @test_mm_cmplt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
685; X32-LABEL: test_mm_cmplt_epi32:
686; X32:       # BB#0:
687; X32-NEXT:    pcmpgtd %xmm0, %xmm1
688; X32-NEXT:    movdqa %xmm1, %xmm0
689; X32-NEXT:    retl
690;
691; X64-LABEL: test_mm_cmplt_epi32:
692; X64:       # BB#0:
693; X64-NEXT:    pcmpgtd %xmm0, %xmm1
694; X64-NEXT:    movdqa %xmm1, %xmm0
695; X64-NEXT:    retq
696  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
697  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
698  %cmp = icmp sgt <4 x i32> %arg1, %arg0
699  %res = sext <4 x i1> %cmp to <4 x i32>
700  %bc = bitcast <4 x i32> %res to <2 x i64>
701  ret <2 x i64> %bc
702}
703
704define <2 x double> @test_mm_cmplt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
705; X32-LABEL: test_mm_cmplt_pd:
706; X32:       # BB#0:
707; X32-NEXT:    cmpltpd %xmm1, %xmm0
708; X32-NEXT:    retl
709;
710; X64-LABEL: test_mm_cmplt_pd:
711; X64:       # BB#0:
712; X64-NEXT:    cmpltpd %xmm1, %xmm0
713; X64-NEXT:    retq
714  %fcmp = fcmp olt <2 x double> %a0, %a1
715  %sext = sext <2 x i1> %fcmp to <2 x i64>
716  %res = bitcast <2 x i64> %sext to <2 x double>
717  ret <2 x double> %res
718}
719
720define <2 x double> @test_mm_cmplt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
721; X32-LABEL: test_mm_cmplt_sd:
722; X32:       # BB#0:
723; X32-NEXT:    cmpltsd %xmm1, %xmm0
724; X32-NEXT:    retl
725;
726; X64-LABEL: test_mm_cmplt_sd:
727; X64:       # BB#0:
728; X64-NEXT:    cmpltsd %xmm1, %xmm0
729; X64-NEXT:    retq
730  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 1)
731  ret <2 x double> %res
732}
733
734define <2 x double> @test_mm_cmpneq_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
735; X32-LABEL: test_mm_cmpneq_pd:
736; X32:       # BB#0:
737; X32-NEXT:    cmpneqpd %xmm1, %xmm0
738; X32-NEXT:    retl
739;
740; X64-LABEL: test_mm_cmpneq_pd:
741; X64:       # BB#0:
742; X64-NEXT:    cmpneqpd %xmm1, %xmm0
743; X64-NEXT:    retq
744  %fcmp = fcmp une <2 x double> %a0, %a1
745  %sext = sext <2 x i1> %fcmp to <2 x i64>
746  %res = bitcast <2 x i64> %sext to <2 x double>
747  ret <2 x double> %res
748}
749
750define <2 x double> @test_mm_cmpneq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
751; X32-LABEL: test_mm_cmpneq_sd:
752; X32:       # BB#0:
753; X32-NEXT:    cmpneqsd %xmm1, %xmm0
754; X32-NEXT:    retl
755;
756; X64-LABEL: test_mm_cmpneq_sd:
757; X64:       # BB#0:
758; X64-NEXT:    cmpneqsd %xmm1, %xmm0
759; X64-NEXT:    retq
760  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 4)
761  ret <2 x double> %res
762}
763
764define <2 x double> @test_mm_cmpnge_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
765; X32-LABEL: test_mm_cmpnge_pd:
766; X32:       # BB#0:
767; X32-NEXT:    cmpnlepd %xmm0, %xmm1
768; X32-NEXT:    movapd %xmm1, %xmm0
769; X32-NEXT:    retl
770;
771; X64-LABEL: test_mm_cmpnge_pd:
772; X64:       # BB#0:
773; X64-NEXT:    cmpnlepd %xmm0, %xmm1
774; X64-NEXT:    movapd %xmm1, %xmm0
775; X64-NEXT:    retq
776  %fcmp = fcmp ugt <2 x double> %a1, %a0
777  %sext = sext <2 x i1> %fcmp to <2 x i64>
778  %res = bitcast <2 x i64> %sext to <2 x double>
779  ret <2 x double> %res
780}
781
782define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
783; X32-LABEL: test_mm_cmpnge_sd:
784; X32:       # BB#0:
785; X32-NEXT:    cmpnlesd %xmm0, %xmm1
786; X32-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
787; X32-NEXT:    retl
788;
789; X64-LABEL: test_mm_cmpnge_sd:
790; X64:       # BB#0:
791; X64-NEXT:    cmpnlesd %xmm0, %xmm1
792; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
793; X64-NEXT:    retq
794  %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 6)
795  %ext0 = extractelement <2 x double> %cmp, i32 0
796  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
797  %ext1 = extractelement <2 x double> %a0, i32 1
798  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
799  ret <2 x double> %ins1
800}
801
802define <2 x double> @test_mm_cmpngt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
803; X32-LABEL: test_mm_cmpngt_pd:
804; X32:       # BB#0:
805; X32-NEXT:    cmpnltpd %xmm0, %xmm1
806; X32-NEXT:    movapd %xmm1, %xmm0
807; X32-NEXT:    retl
808;
809; X64-LABEL: test_mm_cmpngt_pd:
810; X64:       # BB#0:
811; X64-NEXT:    cmpnltpd %xmm0, %xmm1
812; X64-NEXT:    movapd %xmm1, %xmm0
813; X64-NEXT:    retq
814  %fcmp = fcmp uge <2 x double> %a1, %a0
815  %sext = sext <2 x i1> %fcmp to <2 x i64>
816  %res = bitcast <2 x i64> %sext to <2 x double>
817  ret <2 x double> %res
818}
819
820define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
821; X32-LABEL: test_mm_cmpngt_sd:
822; X32:       # BB#0:
823; X32-NEXT:    cmpnltsd %xmm0, %xmm1
824; X32-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
825; X32-NEXT:    retl
826;
827; X64-LABEL: test_mm_cmpngt_sd:
828; X64:       # BB#0:
829; X64-NEXT:    cmpnltsd %xmm0, %xmm1
830; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
831; X64-NEXT:    retq
832  %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 5)
833  %ext0 = extractelement <2 x double> %cmp, i32 0
834  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
835  %ext1 = extractelement <2 x double> %a0, i32 1
836  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
837  ret <2 x double> %ins1
838}
839
840define <2 x double> @test_mm_cmpnle_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
841; X32-LABEL: test_mm_cmpnle_pd:
842; X32:       # BB#0:
843; X32-NEXT:    cmpnlepd %xmm1, %xmm0
844; X32-NEXT:    retl
845;
846; X64-LABEL: test_mm_cmpnle_pd:
847; X64:       # BB#0:
848; X64-NEXT:    cmpnlepd %xmm1, %xmm0
849; X64-NEXT:    retq
850  %fcmp = fcmp ugt <2 x double> %a0, %a1
851  %sext = sext <2 x i1> %fcmp to <2 x i64>
852  %res = bitcast <2 x i64> %sext to <2 x double>
853  ret <2 x double> %res
854}
855
856define <2 x double> @test_mm_cmpnle_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
857; X32-LABEL: test_mm_cmpnle_sd:
858; X32:       # BB#0:
859; X32-NEXT:    cmpnlesd %xmm1, %xmm0
860; X32-NEXT:    retl
861;
862; X64-LABEL: test_mm_cmpnle_sd:
863; X64:       # BB#0:
864; X64-NEXT:    cmpnlesd %xmm1, %xmm0
865; X64-NEXT:    retq
866  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 6)
867  ret <2 x double> %res
868}
869
870define <2 x double> @test_mm_cmpnlt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
871; X32-LABEL: test_mm_cmpnlt_pd:
872; X32:       # BB#0:
873; X32-NEXT:    cmpnltpd %xmm1, %xmm0
874; X32-NEXT:    retl
875;
876; X64-LABEL: test_mm_cmpnlt_pd:
877; X64:       # BB#0:
878; X64-NEXT:    cmpnltpd %xmm1, %xmm0
879; X64-NEXT:    retq
880  %fcmp = fcmp uge <2 x double> %a0, %a1
881  %sext = sext <2 x i1> %fcmp to <2 x i64>
882  %res = bitcast <2 x i64> %sext to <2 x double>
883  ret <2 x double> %res
884}
885
886define <2 x double> @test_mm_cmpnlt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
887; X32-LABEL: test_mm_cmpnlt_sd:
888; X32:       # BB#0:
889; X32-NEXT:    cmpnltsd %xmm1, %xmm0
890; X32-NEXT:    retl
891;
892; X64-LABEL: test_mm_cmpnlt_sd:
893; X64:       # BB#0:
894; X64-NEXT:    cmpnltsd %xmm1, %xmm0
895; X64-NEXT:    retq
896  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 5)
897  ret <2 x double> %res
898}
899
900define <2 x double> @test_mm_cmpord_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
901; X32-LABEL: test_mm_cmpord_pd:
902; X32:       # BB#0:
903; X32-NEXT:    cmpordpd %xmm1, %xmm0
904; X32-NEXT:    retl
905;
906; X64-LABEL: test_mm_cmpord_pd:
907; X64:       # BB#0:
908; X64-NEXT:    cmpordpd %xmm1, %xmm0
909; X64-NEXT:    retq
910  %fcmp = fcmp ord <2 x double> %a0, %a1
911  %sext = sext <2 x i1> %fcmp to <2 x i64>
912  %res = bitcast <2 x i64> %sext to <2 x double>
913  ret <2 x double> %res
914}
915
916define <2 x double> @test_mm_cmpord_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
917; X32-LABEL: test_mm_cmpord_sd:
918; X32:       # BB#0:
919; X32-NEXT:    cmpordsd %xmm1, %xmm0
920; X32-NEXT:    retl
921;
922; X64-LABEL: test_mm_cmpord_sd:
923; X64:       # BB#0:
924; X64-NEXT:    cmpordsd %xmm1, %xmm0
925; X64-NEXT:    retq
926  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7)
927  ret <2 x double> %res
928}
929
930define <2 x double> @test_mm_cmpunord_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
931; X32-LABEL: test_mm_cmpunord_pd:
932; X32:       # BB#0:
933; X32-NEXT:    cmpunordpd %xmm1, %xmm0
934; X32-NEXT:    retl
935;
936; X64-LABEL: test_mm_cmpunord_pd:
937; X64:       # BB#0:
938; X64-NEXT:    cmpunordpd %xmm1, %xmm0
939; X64-NEXT:    retq
940  %fcmp = fcmp uno <2 x double> %a0, %a1
941  %sext = sext <2 x i1> %fcmp to <2 x i64>
942  %res = bitcast <2 x i64> %sext to <2 x double>
943  ret <2 x double> %res
944}
945
946define <2 x double> @test_mm_cmpunord_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
947; X32-LABEL: test_mm_cmpunord_sd:
948; X32:       # BB#0:
949; X32-NEXT:    cmpunordsd %xmm1, %xmm0
950; X32-NEXT:    retl
951;
952; X64-LABEL: test_mm_cmpunord_sd:
953; X64:       # BB#0:
954; X64-NEXT:    cmpunordsd %xmm1, %xmm0
955; X64-NEXT:    retq
956  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 3)
957  ret <2 x double> %res
958}
959
960define i32 @test_mm_comieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
961; X32-LABEL: test_mm_comieq_sd:
962; X32:       # BB#0:
963; X32-NEXT:    comisd %xmm1, %xmm0
964; X32-NEXT:    setnp %al
965; X32-NEXT:    sete %cl
966; X32-NEXT:    andb %al, %cl
967; X32-NEXT:    movzbl %cl, %eax
968; X32-NEXT:    retl
969;
970; X64-LABEL: test_mm_comieq_sd:
971; X64:       # BB#0:
972; X64-NEXT:    comisd %xmm1, %xmm0
973; X64-NEXT:    setnp %al
974; X64-NEXT:    sete %cl
975; X64-NEXT:    andb %al, %cl
976; X64-NEXT:    movzbl %cl, %eax
977; X64-NEXT:    retq
978  %res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
979  ret i32 %res
980}
981declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
982
983define i32 @test_mm_comige_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
984; X32-LABEL: test_mm_comige_sd:
985; X32:       # BB#0:
986; X32-NEXT:    xorl %eax, %eax
987; X32-NEXT:    comisd %xmm1, %xmm0
988; X32-NEXT:    setae %al
989; X32-NEXT:    retl
990;
991; X64-LABEL: test_mm_comige_sd:
992; X64:       # BB#0:
993; X64-NEXT:    xorl %eax, %eax
994; X64-NEXT:    comisd %xmm1, %xmm0
995; X64-NEXT:    setae %al
996; X64-NEXT:    retq
997  %res = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1)
998  ret i32 %res
999}
1000declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readnone
1001
1002define i32 @test_mm_comigt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
1003; X32-LABEL: test_mm_comigt_sd:
1004; X32:       # BB#0:
1005; X32-NEXT:    xorl %eax, %eax
1006; X32-NEXT:    comisd %xmm1, %xmm0
1007; X32-NEXT:    seta %al
1008; X32-NEXT:    retl
1009;
1010; X64-LABEL: test_mm_comigt_sd:
1011; X64:       # BB#0:
1012; X64-NEXT:    xorl %eax, %eax
1013; X64-NEXT:    comisd %xmm1, %xmm0
1014; X64-NEXT:    seta %al
1015; X64-NEXT:    retq
1016  %res = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %a0, <2 x double> %a1)
1017  ret i32 %res
1018}
1019declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readnone
1020
1021define i32 @test_mm_comile_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
1022; X32-LABEL: test_mm_comile_sd:
1023; X32:       # BB#0:
1024; X32-NEXT:    xorl %eax, %eax
1025; X32-NEXT:    comisd %xmm0, %xmm1
1026; X32-NEXT:    setae %al
1027; X32-NEXT:    retl
1028;
1029; X64-LABEL: test_mm_comile_sd:
1030; X64:       # BB#0:
1031; X64-NEXT:    xorl %eax, %eax
1032; X64-NEXT:    comisd %xmm0, %xmm1
1033; X64-NEXT:    setae %al
1034; X64-NEXT:    retq
1035  %res = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1)
1036  ret i32 %res
1037}
1038declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readnone
1039
1040define i32 @test_mm_comilt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
1041; X32-LABEL: test_mm_comilt_sd:
1042; X32:       # BB#0:
1043; X32-NEXT:    xorl %eax, %eax
1044; X32-NEXT:    comisd %xmm0, %xmm1
1045; X32-NEXT:    seta %al
1046; X32-NEXT:    retl
1047;
1048; X64-LABEL: test_mm_comilt_sd:
1049; X64:       # BB#0:
1050; X64-NEXT:    xorl %eax, %eax
1051; X64-NEXT:    comisd %xmm0, %xmm1
1052; X64-NEXT:    seta %al
1053; X64-NEXT:    retq
1054  %res = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %a0, <2 x double> %a1)
1055  ret i32 %res
1056}
1057declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readnone
1058
1059define i32 @test_mm_comineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
1060; X32-LABEL: test_mm_comineq_sd:
1061; X32:       # BB#0:
1062; X32-NEXT:    comisd %xmm1, %xmm0
1063; X32-NEXT:    setp %al
1064; X32-NEXT:    setne %cl
1065; X32-NEXT:    orb %al, %cl
1066; X32-NEXT:    movzbl %cl, %eax
1067; X32-NEXT:    retl
1068;
1069; X64-LABEL: test_mm_comineq_sd:
1070; X64:       # BB#0:
1071; X64-NEXT:    comisd %xmm1, %xmm0
1072; X64-NEXT:    setp %al
1073; X64-NEXT:    setne %cl
1074; X64-NEXT:    orb %al, %cl
1075; X64-NEXT:    movzbl %cl, %eax
1076; X64-NEXT:    retq
1077  %res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1)
1078  ret i32 %res
1079}
1080declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readnone
1081
1082define <2 x double> @test_mm_cvtepi32_pd(<2 x i64> %a0) nounwind {
1083; X32-LABEL: test_mm_cvtepi32_pd:
1084; X32:       # BB#0:
1085; X32-NEXT:    cvtdq2pd %xmm0, %xmm0
1086; X32-NEXT:    retl
1087;
1088; X64-LABEL: test_mm_cvtepi32_pd:
1089; X64:       # BB#0:
1090; X64-NEXT:    cvtdq2pd %xmm0, %xmm0
1091; X64-NEXT:    retq
1092  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1093  %ext = shufflevector <4 x i32> %arg0, <4 x i32> %arg0, <2 x i32> <i32 0, i32 1>
1094  %res = sitofp <2 x i32> %ext to <2 x double>
1095  ret <2 x double> %res
1096}
1097
1098define <4 x float> @test_mm_cvtepi32_ps(<2 x i64> %a0) nounwind {
1099; X32-LABEL: test_mm_cvtepi32_ps:
1100; X32:       # BB#0:
1101; X32-NEXT:    cvtdq2ps %xmm0, %xmm0
1102; X32-NEXT:    retl
1103;
1104; X64-LABEL: test_mm_cvtepi32_ps:
1105; X64:       # BB#0:
1106; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
1107; X64-NEXT:    retq
1108  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1109  %res = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %arg0)
1110  ret <4 x float> %res
1111}
1112declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
1113
1114define <2 x i64> @test_mm_cvtpd_epi32(<2 x double> %a0) nounwind {
1115; X32-LABEL: test_mm_cvtpd_epi32:
1116; X32:       # BB#0:
1117; X32-NEXT:    cvtpd2dq %xmm0, %xmm0
1118; X32-NEXT:    retl
1119;
1120; X64-LABEL: test_mm_cvtpd_epi32:
1121; X64:       # BB#0:
1122; X64-NEXT:    cvtpd2dq %xmm0, %xmm0
1123; X64-NEXT:    retq
1124  %res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
1125  %bc = bitcast <4 x i32> %res to <2 x i64>
1126  ret <2 x i64> %bc
1127}
1128declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
1129
1130define <4 x float> @test_mm_cvtpd_ps(<2 x double> %a0) nounwind {
1131; X32-LABEL: test_mm_cvtpd_ps:
1132; X32:       # BB#0:
1133; X32-NEXT:    cvtpd2ps %xmm0, %xmm0
1134; X32-NEXT:    retl
1135;
1136; X64-LABEL: test_mm_cvtpd_ps:
1137; X64:       # BB#0:
1138; X64-NEXT:    cvtpd2ps %xmm0, %xmm0
1139; X64-NEXT:    retq
1140  %res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
1141  ret <4 x float> %res
1142}
1143declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
1144
1145define <2 x i64> @test_mm_cvtps_epi32(<4 x float> %a0) nounwind {
1146; X32-LABEL: test_mm_cvtps_epi32:
1147; X32:       # BB#0:
1148; X32-NEXT:    cvtps2dq %xmm0, %xmm0
1149; X32-NEXT:    retl
1150;
1151; X64-LABEL: test_mm_cvtps_epi32:
1152; X64:       # BB#0:
1153; X64-NEXT:    cvtps2dq %xmm0, %xmm0
1154; X64-NEXT:    retq
1155  %res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
1156  %bc = bitcast <4 x i32> %res to <2 x i64>
1157  ret <2 x i64> %bc
1158}
1159declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
1160
1161define <2 x double> @test_mm_cvtps_pd(<4 x float> %a0) nounwind {
1162; X32-LABEL: test_mm_cvtps_pd:
1163; X32:       # BB#0:
1164; X32-NEXT:    cvtps2pd %xmm0, %xmm0
1165; X32-NEXT:    retl
1166;
1167; X64-LABEL: test_mm_cvtps_pd:
1168; X64:       # BB#0:
1169; X64-NEXT:    cvtps2pd %xmm0, %xmm0
1170; X64-NEXT:    retq
1171  %ext = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1>
1172  %res = fpext <2 x float> %ext to <2 x double>
1173  ret <2 x double> %res
1174}
1175
1176define double @test_mm_cvtsd_f64(<2 x double> %a0) nounwind {
1177; X32-LABEL: test_mm_cvtsd_f64:
1178; X32:       # BB#0:
1179; X32-NEXT:    pushl %ebp
1180; X32-NEXT:    movl %esp, %ebp
1181; X32-NEXT:    andl $-8, %esp
1182; X32-NEXT:    subl $8, %esp
1183; X32-NEXT:    movlps %xmm0, (%esp)
1184; X32-NEXT:    fldl (%esp)
1185; X32-NEXT:    movl %ebp, %esp
1186; X32-NEXT:    popl %ebp
1187; X32-NEXT:    retl
1188;
1189; X64-LABEL: test_mm_cvtsd_f64:
1190; X64:       # BB#0:
1191; X64-NEXT:    retq
1192  %res = extractelement <2 x double> %a0, i32 0
1193  ret double %res
1194}
1195
1196define i32 @test_mm_cvtsd_si32(<2 x double> %a0) nounwind {
1197; X32-LABEL: test_mm_cvtsd_si32:
1198; X32:       # BB#0:
1199; X32-NEXT:    cvtsd2si %xmm0, %eax
1200; X32-NEXT:    retl
1201;
1202; X64-LABEL: test_mm_cvtsd_si32:
1203; X64:       # BB#0:
1204; X64-NEXT:    cvtsd2si %xmm0, %eax
1205; X64-NEXT:    retq
1206  %res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
1207  ret i32 %res
1208}
1209declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
1210
1211define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
1212; X32-LABEL: test_mm_cvtsi128_si32:
1213; X32:       # BB#0:
1214; X32-NEXT:    movd %xmm0, %eax
1215; X32-NEXT:    retl
1216;
1217; X64-LABEL: test_mm_cvtsi128_si32:
1218; X64:       # BB#0:
1219; X64-NEXT:    movd %xmm0, %eax
1220; X64-NEXT:    retq
1221  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1222  %res = extractelement <4 x i32> %arg0, i32 0
1223  ret i32 %res
1224}
1225
1226define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind {
1227; X32-LABEL: test_mm_cvtsi32_sd:
1228; X32:       # BB#0:
1229; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1230; X32-NEXT:    cvtsi2sdl %eax, %xmm1
1231; X32-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1232; X32-NEXT:    retl
1233;
1234; X64-LABEL: test_mm_cvtsi32_sd:
1235; X64:       # BB#0:
1236; X64-NEXT:    cvtsi2sdl %edi, %xmm1
1237; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1238; X64-NEXT:    retq
1239  %cvt = sitofp i32 %a1 to double
1240  %res = insertelement <2 x double> %a0, double %cvt, i32 0
1241  ret <2 x double> %res
1242}
1243
1244define <2 x i64> @test_mm_cvtsi32_si128(i32 %a0) nounwind {
1245; X32-LABEL: test_mm_cvtsi32_si128:
1246; X32:       # BB#0:
1247; X32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1248; X32-NEXT:    retl
1249;
1250; X64-LABEL: test_mm_cvtsi32_si128:
1251; X64:       # BB#0:
1252; X64-NEXT:    movd %edi, %xmm0
1253; X64-NEXT:    retq
1254  %res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
1255  %res1 = insertelement <4 x i32> %res0, i32 0, i32 1
1256  %res2 = insertelement <4 x i32> %res1, i32 0, i32 2
1257  %res3 = insertelement <4 x i32> %res2, i32 0, i32 3
1258  %res = bitcast <4 x i32> %res3 to <2 x i64>
1259  ret <2 x i64> %res
1260}
1261
1262define <2 x double> @test_mm_cvtss_sd(<2 x double> %a0, <4 x float> %a1) nounwind {
1263; X32-LABEL: test_mm_cvtss_sd:
1264; X32:       # BB#0:
1265; X32-NEXT:    cvtss2sd %xmm1, %xmm1
1266; X32-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1267; X32-NEXT:    retl
1268;
1269; X64-LABEL: test_mm_cvtss_sd:
1270; X64:       # BB#0:
1271; X64-NEXT:    cvtss2sd %xmm1, %xmm1
1272; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1273; X64-NEXT:    retq
1274  %ext = extractelement <4 x float> %a1, i32 0
1275  %cvt = fpext float %ext to double
1276  %res = insertelement <2 x double> %a0, double %cvt, i32 0
1277  ret <2 x double> %res
1278}
1279
1280define <2 x i64> @test_mm_cvttpd_epi32(<2 x double> %a0) nounwind {
1281; X32-LABEL: test_mm_cvttpd_epi32:
1282; X32:       # BB#0:
1283; X32-NEXT:    cvttpd2dq %xmm0, %xmm0
1284; X32-NEXT:    retl
1285;
1286; X64-LABEL: test_mm_cvttpd_epi32:
1287; X64:       # BB#0:
1288; X64-NEXT:    cvttpd2dq %xmm0, %xmm0
1289; X64-NEXT:    retq
1290  %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
1291  %bc = bitcast <4 x i32> %res to <2 x i64>
1292  ret <2 x i64> %bc
1293}
1294declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
1295
1296define <2 x i64> @test_mm_cvttps_epi32(<4 x float> %a0) nounwind {
1297; X32-LABEL: test_mm_cvttps_epi32:
1298; X32:       # BB#0:
1299; X32-NEXT:    cvttps2dq %xmm0, %xmm0
1300; X32-NEXT:    retl
1301;
1302; X64-LABEL: test_mm_cvttps_epi32:
1303; X64:       # BB#0:
1304; X64-NEXT:    cvttps2dq %xmm0, %xmm0
1305; X64-NEXT:    retq
1306  %res = fptosi <4 x float> %a0 to <4 x i32>
1307  %bc = bitcast <4 x i32> %res to <2 x i64>
1308  ret <2 x i64> %bc
1309}
1310
1311define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
1312; X32-LABEL: test_mm_cvttsd_si32:
1313; X32:       # BB#0:
1314; X32-NEXT:    cvttsd2si %xmm0, %eax
1315; X32-NEXT:    retl
1316;
1317; X64-LABEL: test_mm_cvttsd_si32:
1318; X64:       # BB#0:
1319; X64-NEXT:    cvttsd2si %xmm0, %eax
1320; X64-NEXT:    retq
1321  %ext = extractelement <2 x double> %a0, i32 0
1322  %res = fptosi double %ext to i32
1323  ret i32 %res
1324}
1325
1326define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
1327; X32-LABEL: test_mm_div_pd:
1328; X32:       # BB#0:
1329; X32-NEXT:    divpd %xmm1, %xmm0
1330; X32-NEXT:    retl
1331;
1332; X64-LABEL: test_mm_div_pd:
1333; X64:       # BB#0:
1334; X64-NEXT:    divpd %xmm1, %xmm0
1335; X64-NEXT:    retq
1336  %res = fdiv <2 x double> %a0, %a1
1337  ret <2 x double> %res
1338}
1339
1340define <2 x double> @test_mm_div_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
1341; X32-LABEL: test_mm_div_sd:
1342; X32:       # BB#0:
1343; X32-NEXT:    divsd %xmm1, %xmm0
1344; X32-NEXT:    retl
1345;
1346; X64-LABEL: test_mm_div_sd:
1347; X64:       # BB#0:
1348; X64-NEXT:    divsd %xmm1, %xmm0
1349; X64-NEXT:    retq
1350  %ext0 = extractelement <2 x double> %a0, i32 0
1351  %ext1 = extractelement <2 x double> %a1, i32 0
1352  %fdiv = fdiv double %ext0, %ext1
1353  %res = insertelement <2 x double> %a0, double %fdiv, i32 0
1354  ret <2 x double> %res
1355}
1356
1357define i32 @test_mm_extract_epi16(<2 x i64> %a0) nounwind {
1358; X32-LABEL: test_mm_extract_epi16:
1359; X32:       # BB#0:
1360; X32-NEXT:    pextrw $1, %xmm0, %eax
1361; X32-NEXT:    movzwl %ax, %eax
1362; X32-NEXT:    retl
1363;
1364; X64-LABEL: test_mm_extract_epi16:
1365; X64:       # BB#0:
1366; X64-NEXT:    pextrw $1, %xmm0, %eax
1367; X64-NEXT:    movzwl %ax, %eax
1368; X64-NEXT:    retq
1369  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
1370  %ext = extractelement <8 x i16> %arg0, i32 1
1371  %res = zext i16 %ext to i32
1372  ret i32 %res
1373}
1374
1375define <2 x i64> @test_mm_insert_epi16(<2 x i64> %a0, i16 %a1) nounwind {
1376; X32-LABEL: test_mm_insert_epi16:
1377; X32:       # BB#0:
1378; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
1379; X32-NEXT:    pinsrw $1, %eax, %xmm0
1380; X32-NEXT:    retl
1381;
1382; X64-LABEL: test_mm_insert_epi16:
1383; X64:       # BB#0:
1384; X64-NEXT:    pinsrw $1, %edi, %xmm0
1385; X64-NEXT:    retq
1386  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
1387  %res = insertelement <8 x i16> %arg0, i16 %a1,i32 1
1388  %bc = bitcast <8 x i16> %res to <2 x i64>
1389  ret <2 x i64> %bc
1390}
1391
1392define void @test_mm_lfence() nounwind {
1393; X32-LABEL: test_mm_lfence:
1394; X32:       # BB#0:
1395; X32-NEXT:    lfence
1396; X32-NEXT:    retl
1397;
1398; X64-LABEL: test_mm_lfence:
1399; X64:       # BB#0:
1400; X64-NEXT:    lfence
1401; X64-NEXT:    retq
1402  call void @llvm.x86.sse2.lfence()
1403  ret void
1404}
1405declare void @llvm.x86.sse2.lfence() nounwind readnone
1406
1407define <2 x double> @test_mm_load_pd(double* %a0) nounwind {
1408; X32-LABEL: test_mm_load_pd:
1409; X32:       # BB#0:
1410; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1411; X32-NEXT:    movaps (%eax), %xmm0
1412; X32-NEXT:    retl
1413;
1414; X64-LABEL: test_mm_load_pd:
1415; X64:       # BB#0:
1416; X64-NEXT:    movaps (%rdi), %xmm0
1417; X64-NEXT:    retq
1418  %arg0 = bitcast double* %a0 to <2 x double>*
1419  %res = load <2 x double>, <2 x double>* %arg0, align 16
1420  ret <2 x double> %res
1421}
1422
1423define <2 x double> @test_mm_load_sd(double* %a0) nounwind {
1424; X32-LABEL: test_mm_load_sd:
1425; X32:       # BB#0:
1426; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1427; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1428; X32-NEXT:    retl
1429;
1430; X64-LABEL: test_mm_load_sd:
1431; X64:       # BB#0:
1432; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1433; X64-NEXT:    retq
1434  %ld = load double, double* %a0, align 1
1435  %res0 = insertelement <2 x double> undef, double %ld, i32 0
1436  %res1 = insertelement <2 x double> %res0, double 0.0, i32 1
1437  ret <2 x double> %res1
1438}
1439
1440define <2 x i64> @test_mm_load_si128(<2 x i64>* %a0) nounwind {
1441; X32-LABEL: test_mm_load_si128:
1442; X32:       # BB#0:
1443; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1444; X32-NEXT:    movaps (%eax), %xmm0
1445; X32-NEXT:    retl
1446;
1447; X64-LABEL: test_mm_load_si128:
1448; X64:       # BB#0:
1449; X64-NEXT:    movaps (%rdi), %xmm0
1450; X64-NEXT:    retq
1451  %res = load <2 x i64>, <2 x i64>* %a0, align 16
1452  ret <2 x i64> %res
1453}
1454
1455define <2 x double> @test_mm_load1_pd(double* %a0) nounwind {
1456; X32-LABEL: test_mm_load1_pd:
1457; X32:       # BB#0:
1458; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1459; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1460; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
1461; X32-NEXT:    retl
1462;
1463; X64-LABEL: test_mm_load1_pd:
1464; X64:       # BB#0:
1465; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1466; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
1467; X64-NEXT:    retq
1468  %ld = load double, double* %a0, align 8
1469  %res0 = insertelement <2 x double> undef, double %ld, i32 0
1470  %res1 = insertelement <2 x double> %res0, double %ld, i32 1
1471  ret <2 x double> %res1
1472}
1473
1474define <2 x double> @test_mm_loadh_pd(<2 x double> %a0, double* %a1) nounwind {
1475; X32-LABEL: test_mm_loadh_pd:
1476; X32:       # BB#0:
1477; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1478; X32-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
1479; X32-NEXT:    retl
1480;
1481; X64-LABEL: test_mm_loadh_pd:
1482; X64:       # BB#0:
1483; X64-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
1484; X64-NEXT:    retq
1485  %ld = load double, double* %a1, align 8
1486  %res = insertelement <2 x double> %a0, double %ld, i32 1
1487  ret <2 x double> %res
1488}
1489
1490define <2 x i64> @test_mm_loadl_epi64(<2 x i64> %a0, <2 x i64>* %a1) nounwind {
1491; X32-LABEL: test_mm_loadl_epi64:
1492; X32:       # BB#0:
1493; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1494; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1495; X32-NEXT:    retl
1496;
1497; X64-LABEL: test_mm_loadl_epi64:
1498; X64:       # BB#0:
1499; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1500; X64-NEXT:    retq
1501  %bc = bitcast <2 x i64>* %a1 to i64*
1502  %ld = load i64, i64* %bc, align 1
1503  %res0 = insertelement <2 x i64> undef, i64 %ld, i32 0
1504  %res1 = insertelement <2 x i64> %res0, i64 0, i32 1
1505  ret <2 x i64> %res1
1506}
1507
1508define <2 x double> @test_mm_loadl_pd(<2 x double> %a0, double* %a1) nounwind {
1509; X32-LABEL: test_mm_loadl_pd:
1510; X32:       # BB#0:
1511; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1512; X32-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
1513; X32-NEXT:    retl
1514;
1515; X64-LABEL: test_mm_loadl_pd:
1516; X64:       # BB#0:
1517; X64-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
1518; X64-NEXT:    retq
1519  %ld = load double, double* %a1, align 8
1520  %res = insertelement <2 x double> %a0, double %ld, i32 0
1521  ret <2 x double> %res
1522}
1523
1524define <2 x double> @test_mm_loadr_pd(double* %a0) nounwind {
1525; X32-LABEL: test_mm_loadr_pd:
1526; X32:       # BB#0:
1527; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1528; X32-NEXT:    movapd (%eax), %xmm0
1529; X32-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
1530; X32-NEXT:    retl
1531;
1532; X64-LABEL: test_mm_loadr_pd:
1533; X64:       # BB#0:
1534; X64-NEXT:    movapd (%rdi), %xmm0
1535; X64-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
1536; X64-NEXT:    retq
1537  %arg0 = bitcast double* %a0 to <2 x double>*
1538  %ld = load <2 x double>, <2 x double>* %arg0, align 16
1539  %res = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 0>
1540  ret <2 x double> %res
1541}
1542
1543define <2 x double> @test_mm_loadu_pd(double* %a0) nounwind {
1544; X32-LABEL: test_mm_loadu_pd:
1545; X32:       # BB#0:
1546; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1547; X32-NEXT:    movups (%eax), %xmm0
1548; X32-NEXT:    retl
1549;
1550; X64-LABEL: test_mm_loadu_pd:
1551; X64:       # BB#0:
1552; X64-NEXT:    movups (%rdi), %xmm0
1553; X64-NEXT:    retq
1554  %arg0 = bitcast double* %a0 to <2 x double>*
1555  %res = load <2 x double>, <2 x double>* %arg0, align 1
1556  ret <2 x double> %res
1557}
1558
1559define <2 x i64> @test_mm_loadu_si128(<2 x i64>* %a0) nounwind {
1560; X32-LABEL: test_mm_loadu_si128:
1561; X32:       # BB#0:
1562; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1563; X32-NEXT:    movups (%eax), %xmm0
1564; X32-NEXT:    retl
1565;
1566; X64-LABEL: test_mm_loadu_si128:
1567; X64:       # BB#0:
1568; X64-NEXT:    movups (%rdi), %xmm0
1569; X64-NEXT:    retq
1570  %res = load <2 x i64>, <2 x i64>* %a0, align 1
1571  ret <2 x i64> %res
1572}
1573
1574define <2 x i64> @test_mm_madd_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
1575; X32-LABEL: test_mm_madd_epi16:
1576; X32:       # BB#0:
1577; X32-NEXT:    pmaddwd %xmm1, %xmm0
1578; X32-NEXT:    retl
1579;
1580; X64-LABEL: test_mm_madd_epi16:
1581; X64:       # BB#0:
1582; X64-NEXT:    pmaddwd %xmm1, %xmm0
1583; X64-NEXT:    retq
1584  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
1585  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
1586  %res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %arg0, <8 x i16> %arg1)
1587  %bc = bitcast <4 x i32> %res to <2 x i64>
1588  ret <2 x i64> %bc
1589}
1590declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
1591
1592define void @test_mm_maskmoveu_si128(<2 x i64> %a0, <2 x i64> %a1, i8* %a2) nounwind {
1593; X32-LABEL: test_mm_maskmoveu_si128:
1594; X32:       # BB#0:
1595; X32-NEXT:    pushl %edi
1596; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
1597; X32-NEXT:    maskmovdqu %xmm1, %xmm0
1598; X32-NEXT:    popl %edi
1599; X32-NEXT:    retl
1600;
1601; X64-LABEL: test_mm_maskmoveu_si128:
1602; X64:       # BB#0:
1603; X64-NEXT:    maskmovdqu %xmm1, %xmm0
1604; X64-NEXT:    retq
1605  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
1606  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
1607  call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %arg0, <16 x i8> %arg1, i8* %a2)
1608  ret void
1609}
1610declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*) nounwind
1611
1612define <2 x i64> @test_mm_max_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
1613; X32-LABEL: test_mm_max_epi16:
1614; X32:       # BB#0:
1615; X32-NEXT:    pmaxsw %xmm1, %xmm0
1616; X32-NEXT:    retl
1617;
1618; X64-LABEL: test_mm_max_epi16:
1619; X64:       # BB#0:
1620; X64-NEXT:    pmaxsw %xmm1, %xmm0
1621; X64-NEXT:    retq
1622  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
1623  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
1624  %cmp = icmp sgt <8 x i16> %arg0, %arg1
1625  %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
1626  %bc = bitcast <8 x i16> %sel to <2 x i64>
1627  ret <2 x i64> %bc
1628}
1629
1630define <2 x i64> @test_mm_max_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
1631; X32-LABEL: test_mm_max_epu8:
1632; X32:       # BB#0:
1633; X32-NEXT:    pmaxub %xmm1, %xmm0
1634; X32-NEXT:    retl
1635;
1636; X64-LABEL: test_mm_max_epu8:
1637; X64:       # BB#0:
1638; X64-NEXT:    pmaxub %xmm1, %xmm0
1639; X64-NEXT:    retq
1640  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
1641  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
1642  %cmp = icmp ugt <16 x i8> %arg0, %arg1
1643  %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
1644  %bc = bitcast <16 x i8> %sel to <2 x i64>
1645  ret <2 x i64> %bc
1646}
1647
1648define <2 x double> @test_mm_max_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
1649; X32-LABEL: test_mm_max_pd:
1650; X32:       # BB#0:
1651; X32-NEXT:    maxpd %xmm1, %xmm0
1652; X32-NEXT:    retl
1653;
1654; X64-LABEL: test_mm_max_pd:
1655; X64:       # BB#0:
1656; X64-NEXT:    maxpd %xmm1, %xmm0
1657; X64-NEXT:    retq
1658  %res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
1659  ret <2 x double> %res
1660}
1661declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
1662
1663define <2 x double> @test_mm_max_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
1664; X32-LABEL: test_mm_max_sd:
1665; X32:       # BB#0:
1666; X32-NEXT:    maxsd %xmm1, %xmm0
1667; X32-NEXT:    retl
1668;
1669; X64-LABEL: test_mm_max_sd:
1670; X64:       # BB#0:
1671; X64-NEXT:    maxsd %xmm1, %xmm0
1672; X64-NEXT:    retq
1673  %res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
1674  ret <2 x double> %res
1675}
1676declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
1677
1678define void @test_mm_mfence() nounwind {
1679; X32-LABEL: test_mm_mfence:
1680; X32:       # BB#0:
1681; X32-NEXT:    mfence
1682; X32-NEXT:    retl
1683;
1684; X64-LABEL: test_mm_mfence:
1685; X64:       # BB#0:
1686; X64-NEXT:    mfence
1687; X64-NEXT:    retq
1688  call void @llvm.x86.sse2.mfence()
1689  ret void
1690}
1691declare void @llvm.x86.sse2.mfence() nounwind readnone
1692
1693define <2 x i64> @test_mm_min_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
1694; X32-LABEL: test_mm_min_epi16:
1695; X32:       # BB#0:
1696; X32-NEXT:    pminsw %xmm1, %xmm0
1697; X32-NEXT:    retl
1698;
1699; X64-LABEL: test_mm_min_epi16:
1700; X64:       # BB#0:
1701; X64-NEXT:    pminsw %xmm1, %xmm0
1702; X64-NEXT:    retq
1703  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
1704  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
1705  %cmp = icmp slt <8 x i16> %arg0, %arg1
1706  %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
1707  %bc = bitcast <8 x i16> %sel to <2 x i64>
1708  ret <2 x i64> %bc
1709}
1710
1711define <2 x i64> @test_mm_min_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
1712; X32-LABEL: test_mm_min_epu8:
1713; X32:       # BB#0:
1714; X32-NEXT:    pminub %xmm1, %xmm0
1715; X32-NEXT:    retl
1716;
1717; X64-LABEL: test_mm_min_epu8:
1718; X64:       # BB#0:
1719; X64-NEXT:    pminub %xmm1, %xmm0
1720; X64-NEXT:    retq
1721  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
1722  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
1723  %cmp = icmp ult <16 x i8> %arg0, %arg1
1724  %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
1725  %bc = bitcast <16 x i8> %sel to <2 x i64>
1726  ret <2 x i64> %bc
1727}
1728
1729define <2 x double> @test_mm_min_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
1730; X32-LABEL: test_mm_min_pd:
1731; X32:       # BB#0:
1732; X32-NEXT:    minpd %xmm1, %xmm0
1733; X32-NEXT:    retl
1734;
1735; X64-LABEL: test_mm_min_pd:
1736; X64:       # BB#0:
1737; X64-NEXT:    minpd %xmm1, %xmm0
1738; X64-NEXT:    retq
1739  %res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
1740  ret <2 x double> %res
1741}
1742declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
1743
1744define <2 x double> @test_mm_min_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
1745; X32-LABEL: test_mm_min_sd:
1746; X32:       # BB#0:
1747; X32-NEXT:    minsd %xmm1, %xmm0
1748; X32-NEXT:    retl
1749;
1750; X64-LABEL: test_mm_min_sd:
1751; X64:       # BB#0:
1752; X64-NEXT:    minsd %xmm1, %xmm0
1753; X64-NEXT:    retq
1754  %res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
1755  ret <2 x double> %res
1756}
1757declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
1758
1759define <2 x i64> @test_mm_move_epi64(<2 x i64> %a0) nounwind {
1760; X32-LABEL: test_mm_move_epi64:
1761; X32:       # BB#0:
1762; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
1763; X32-NEXT:    retl
1764;
1765; X64-LABEL: test_mm_move_epi64:
1766; X64:       # BB#0:
1767; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
1768; X64-NEXT:    retq
1769  %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
1770  ret <2 x i64> %res
1771}
1772
1773define <2 x double> @test_mm_move_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
1774; X32-LABEL: test_mm_move_sd:
1775; X32:       # BB#0:
1776; X32-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1777; X32-NEXT:    retl
1778;
1779; X64-LABEL: test_mm_move_sd:
1780; X64:       # BB#0:
1781; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1782; X64-NEXT:    retq
1783  %ext0 = extractelement <2 x double> %a1, i32 0
1784  %res0 = insertelement <2 x double> undef, double %ext0, i32 0
1785  %ext1 = extractelement <2 x double> %a0, i32 1
1786  %res1 = insertelement <2 x double> %res0, double %ext1, i32 1
1787  ret <2 x double> %res1
1788}
1789
1790define i32 @test_mm_movemask_epi8(<2 x i64> %a0) nounwind {
1791; X32-LABEL: test_mm_movemask_epi8:
1792; X32:       # BB#0:
1793; X32-NEXT:    pmovmskb %xmm0, %eax
1794; X32-NEXT:    retl
1795;
1796; X64-LABEL: test_mm_movemask_epi8:
1797; X64:       # BB#0:
1798; X64-NEXT:    pmovmskb %xmm0, %eax
1799; X64-NEXT:    retq
1800  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
1801  %res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %arg0)
1802  ret i32 %res
1803}
1804declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
1805
1806define i32 @test_mm_movemask_pd(<2 x double> %a0) nounwind {
1807; X32-LABEL: test_mm_movemask_pd:
1808; X32:       # BB#0:
1809; X32-NEXT:    movmskpd %xmm0, %eax
1810; X32-NEXT:    retl
1811;
1812; X64-LABEL: test_mm_movemask_pd:
1813; X64:       # BB#0:
1814; X64-NEXT:    movmskpd %xmm0, %eax
1815; X64-NEXT:    retq
1816  %res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
1817  ret i32 %res
1818}
1819declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
1820
1821define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) {
1822; X32-LABEL: test_mm_mul_epu32:
1823; X32:       # BB#0:
1824; X32-NEXT:    pmuludq %xmm1, %xmm0
1825; X32-NEXT:    retl
1826;
1827; X64-LABEL: test_mm_mul_epu32:
1828; X64:       # BB#0:
1829; X64-NEXT:    pmuludq %xmm1, %xmm0
1830; X64-NEXT:    retq
1831  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1832  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1833  %res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %arg0, <4 x i32> %arg1)
1834  ret <2 x i64> %res
1835}
1836declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
1837
1838define <2 x double> @test_mm_mul_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
1839; X32-LABEL: test_mm_mul_pd:
1840; X32:       # BB#0:
1841; X32-NEXT:    mulpd %xmm1, %xmm0
1842; X32-NEXT:    retl
1843;
1844; X64-LABEL: test_mm_mul_pd:
1845; X64:       # BB#0:
1846; X64-NEXT:    mulpd %xmm1, %xmm0
1847; X64-NEXT:    retq
1848  %res = fmul <2 x double> %a0, %a1
1849  ret <2 x double> %res
1850}
1851
1852define <2 x double> @test_mm_mul_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
1853; X32-LABEL: test_mm_mul_sd:
1854; X32:       # BB#0:
1855; X32-NEXT:    mulsd %xmm1, %xmm0
1856; X32-NEXT:    retl
1857;
1858; X64-LABEL: test_mm_mul_sd:
1859; X64:       # BB#0:
1860; X64-NEXT:    mulsd %xmm1, %xmm0
1861; X64-NEXT:    retq
1862  %ext0 = extractelement <2 x double> %a0, i32 0
1863  %ext1 = extractelement <2 x double> %a1, i32 0
1864  %fmul = fmul double %ext0, %ext1
1865  %res = insertelement <2 x double> %a0, double %fmul, i32 0
1866  ret <2 x double> %res
1867}
1868
1869define <2 x i64> @test_mm_mulhi_epi16(<2 x i64> %a0, <2 x i64> %a1) {
1870; X32-LABEL: test_mm_mulhi_epi16:
1871; X32:       # BB#0:
1872; X32-NEXT:    pmulhw %xmm1, %xmm0
1873; X32-NEXT:    retl
1874;
1875; X64-LABEL: test_mm_mulhi_epi16:
1876; X64:       # BB#0:
1877; X64-NEXT:    pmulhw %xmm1, %xmm0
1878; X64-NEXT:    retq
1879  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
1880  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
1881  %res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %arg0, <8 x i16> %arg1)
1882  %bc = bitcast <8 x i16> %res to <2 x i64>
1883  ret <2 x i64> %bc
1884}
1885declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
1886
1887define <2 x i64> @test_mm_mulhi_epu16(<2 x i64> %a0, <2 x i64> %a1) {
1888; X32-LABEL: test_mm_mulhi_epu16:
1889; X32:       # BB#0:
1890; X32-NEXT:    pmulhuw %xmm1, %xmm0
1891; X32-NEXT:    retl
1892;
1893; X64-LABEL: test_mm_mulhi_epu16:
1894; X64:       # BB#0:
1895; X64-NEXT:    pmulhuw %xmm1, %xmm0
1896; X64-NEXT:    retq
1897  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
1898  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
1899  %res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %arg0, <8 x i16> %arg1)
1900  %bc = bitcast <8 x i16> %res to <2 x i64>
1901  ret <2 x i64> %bc
1902}
1903declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
1904
1905define <2 x i64> @test_mm_mullo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
1906; X32-LABEL: test_mm_mullo_epi16:
1907; X32:       # BB#0:
1908; X32-NEXT:    pmullw %xmm1, %xmm0
1909; X32-NEXT:    retl
1910;
1911; X64-LABEL: test_mm_mullo_epi16:
1912; X64:       # BB#0:
1913; X64-NEXT:    pmullw %xmm1, %xmm0
1914; X64-NEXT:    retq
1915  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
1916  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
1917  %res = mul <8 x i16> %arg0, %arg1
1918  %bc = bitcast <8 x i16> %res to <2 x i64>
1919  ret <2 x i64> %bc
1920}
1921
1922define <2 x double> @test_mm_or_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
1923; X32-LABEL: test_mm_or_pd:
1924; X32:       # BB#0:
1925; X32-NEXT:    orps %xmm1, %xmm0
1926; X32-NEXT:    retl
1927;
1928; X64-LABEL: test_mm_or_pd:
1929; X64:       # BB#0:
1930; X64-NEXT:    orps %xmm1, %xmm0
1931; X64-NEXT:    retq
1932  %arg0 = bitcast <2 x double> %a0 to <4 x i32>
1933  %arg1 = bitcast <2 x double> %a1 to <4 x i32>
1934  %res = or <4 x i32> %arg0, %arg1
1935  %bc = bitcast <4 x i32> %res to <2 x double>
1936  ret <2 x double> %bc
1937}
1938
1939define <2 x i64> @test_mm_or_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
1940; X32-LABEL: test_mm_or_si128:
1941; X32:       # BB#0:
1942; X32-NEXT:    orps %xmm1, %xmm0
1943; X32-NEXT:    retl
1944;
1945; X64-LABEL: test_mm_or_si128:
1946; X64:       # BB#0:
1947; X64-NEXT:    orps %xmm1, %xmm0
1948; X64-NEXT:    retq
1949  %res = or <2 x i64> %a0, %a1
1950  ret <2 x i64> %res
1951}
1952
1953define <2 x i64> @test_mm_packs_epi16(<2 x i64> %a0, <2 x i64> %a1) {
1954; X32-LABEL: test_mm_packs_epi16:
1955; X32:       # BB#0:
1956; X32-NEXT:    packsswb %xmm1, %xmm0
1957; X32-NEXT:    retl
1958;
1959; X64-LABEL: test_mm_packs_epi16:
1960; X64:       # BB#0:
1961; X64-NEXT:    packsswb %xmm1, %xmm0
1962; X64-NEXT:    retq
1963  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
1964  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
1965  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
1966  %bc = bitcast <16 x i8> %res to <2 x i64>
1967  ret <2 x i64> %bc
1968}
1969declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
1970
1971define <2 x i64> @test_mm_packs_epi32(<2 x i64> %a0, <2 x i64> %a1) {
1972; X32-LABEL: test_mm_packs_epi32:
1973; X32:       # BB#0:
1974; X32-NEXT:    packssdw %xmm1, %xmm0
1975; X32-NEXT:    retl
1976;
1977; X64-LABEL: test_mm_packs_epi32:
1978; X64:       # BB#0:
1979; X64-NEXT:    packssdw %xmm1, %xmm0
1980; X64-NEXT:    retq
1981  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1982  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1983  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %arg0, <4 x i32> %arg1)
1984  %bc = bitcast <8 x i16> %res to <2 x i64>
1985  ret <2 x i64> %bc
1986}
1987declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
1988
1989define <2 x i64> @test_mm_packus_epi16(<2 x i64> %a0, <2 x i64> %a1) {
1990; X32-LABEL: test_mm_packus_epi16:
1991; X32:       # BB#0:
1992; X32-NEXT:    packuswb %xmm1, %xmm0
1993; X32-NEXT:    retl
1994;
1995; X64-LABEL: test_mm_packus_epi16:
1996; X64:       # BB#0:
1997; X64-NEXT:    packuswb %xmm1, %xmm0
1998; X64-NEXT:    retq
1999  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
2000  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2001  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
2002  %bc = bitcast <16 x i8> %res to <2 x i64>
2003  ret <2 x i64> %bc
2004}
2005declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
2006
2007define void @test_mm_pause() nounwind {
2008; X32-LABEL: test_mm_pause:
2009; X32:       # BB#0:
2010; X32-NEXT:    pause
2011; X32-NEXT:    retl
2012;
2013; X64-LABEL: test_mm_pause:
2014; X64:       # BB#0:
2015; X64-NEXT:    pause
2016; X64-NEXT:    retq
2017  call void @llvm.x86.sse2.pause()
2018  ret void
2019}
2020declare void @llvm.x86.sse2.pause() nounwind readnone
2021
2022define <2 x i64> @test_mm_sad_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
2023; X32-LABEL: test_mm_sad_epu8:
2024; X32:       # BB#0:
2025; X32-NEXT:    psadbw %xmm1, %xmm0
2026; X32-NEXT:    retl
2027;
2028; X64-LABEL: test_mm_sad_epu8:
2029; X64:       # BB#0:
2030; X64-NEXT:    psadbw %xmm1, %xmm0
2031; X64-NEXT:    retq
2032  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
2033  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
2034  %res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %arg0, <16 x i8> %arg1)
2035  ret <2 x i64> %res
2036}
2037declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
2038
2039define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) nounwind {
2040; X32-LABEL: test_mm_set_epi8:
2041; X32:       # BB#0:
2042; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2043; X32-NEXT:    movd %eax, %xmm0
2044; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2045; X32-NEXT:    movd %eax, %xmm1
2046; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2047; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2048; X32-NEXT:    movd %eax, %xmm0
2049; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2050; X32-NEXT:    movd %eax, %xmm2
2051; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2052; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2053; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2054; X32-NEXT:    movd %eax, %xmm0
2055; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2056; X32-NEXT:    movd %eax, %xmm3
2057; X32-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2058; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2059; X32-NEXT:    movd %eax, %xmm0
2060; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2061; X32-NEXT:    movd %eax, %xmm1
2062; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2063; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2064; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2065; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2066; X32-NEXT:    movd %eax, %xmm0
2067; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2068; X32-NEXT:    movd %eax, %xmm2
2069; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2070; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2071; X32-NEXT:    movd %eax, %xmm0
2072; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2073; X32-NEXT:    movd %eax, %xmm3
2074; X32-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2075; X32-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2076; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2077; X32-NEXT:    movd %eax, %xmm0
2078; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2079; X32-NEXT:    movd %eax, %xmm2
2080; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2081; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2082; X32-NEXT:    movd %eax, %xmm4
2083; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2084; X32-NEXT:    movd %eax, %xmm0
2085; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2086; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2087; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2088; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2089; X32-NEXT:    retl
2090;
2091; X64-LABEL: test_mm_set_epi8:
2092; X64:       # BB#0:
2093; X64-NEXT:    movzbl %dil, %eax
2094; X64-NEXT:    movd %eax, %xmm0
2095; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2096; X64-NEXT:    movd %eax, %xmm1
2097; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2098; X64-NEXT:    movzbl %r8b, %eax
2099; X64-NEXT:    movd %eax, %xmm0
2100; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2101; X64-NEXT:    movd %eax, %xmm2
2102; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2103; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2104; X64-NEXT:    movzbl %dl, %eax
2105; X64-NEXT:    movd %eax, %xmm0
2106; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2107; X64-NEXT:    movd %eax, %xmm3
2108; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2109; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2110; X64-NEXT:    movd %eax, %xmm0
2111; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2112; X64-NEXT:    movd %eax, %xmm1
2113; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2114; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2115; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2116; X64-NEXT:    movzbl %sil, %eax
2117; X64-NEXT:    movd %eax, %xmm0
2118; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2119; X64-NEXT:    movd %eax, %xmm2
2120; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2121; X64-NEXT:    movzbl %r9b, %eax
2122; X64-NEXT:    movd %eax, %xmm0
2123; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2124; X64-NEXT:    movd %eax, %xmm3
2125; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2126; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2127; X64-NEXT:    movzbl %cl, %eax
2128; X64-NEXT:    movd %eax, %xmm0
2129; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2130; X64-NEXT:    movd %eax, %xmm2
2131; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2132; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2133; X64-NEXT:    movd %eax, %xmm4
2134; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2135; X64-NEXT:    movd %eax, %xmm0
2136; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2137; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2138; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2139; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2140; X64-NEXT:    retq
2141  %res0  = insertelement <16 x i8> undef,  i8 %a15, i32 0
2142  %res1  = insertelement <16 x i8> %res0,  i8 %a14, i32 1
2143  %res2  = insertelement <16 x i8> %res1,  i8 %a13, i32 2
2144  %res3  = insertelement <16 x i8> %res2,  i8 %a12, i32 3
2145  %res4  = insertelement <16 x i8> %res3,  i8 %a11, i32 4
2146  %res5  = insertelement <16 x i8> %res4,  i8 %a10, i32 5
2147  %res6  = insertelement <16 x i8> %res5,  i8 %a9 , i32 6
2148  %res7  = insertelement <16 x i8> %res6,  i8 %a8 , i32 7
2149  %res8  = insertelement <16 x i8> %res7,  i8 %a7 , i32 8
2150  %res9  = insertelement <16 x i8> %res8,  i8 %a6 , i32 9
2151  %res10 = insertelement <16 x i8> %res9,  i8 %a5 , i32 10
2152  %res11 = insertelement <16 x i8> %res10, i8 %a4 , i32 11
2153  %res12 = insertelement <16 x i8> %res11, i8 %a3 , i32 12
2154  %res13 = insertelement <16 x i8> %res12, i8 %a2 , i32 13
2155  %res14 = insertelement <16 x i8> %res13, i8 %a1 , i32 14
2156  %res15 = insertelement <16 x i8> %res14, i8 %a0 , i32 15
2157  %res = bitcast <16 x i8> %res15 to <2 x i64>
2158  ret <2 x i64> %res
2159}
2160
2161define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
2162; X32-LABEL: test_mm_set_epi16:
2163; X32:       # BB#0:
2164; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2165; X32-NEXT:    movd %eax, %xmm1
2166; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2167; X32-NEXT:    movd %eax, %xmm2
2168; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2169; X32-NEXT:    movd %eax, %xmm3
2170; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2171; X32-NEXT:    movd %eax, %xmm4
2172; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2173; X32-NEXT:    movd %eax, %xmm5
2174; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2175; X32-NEXT:    movd %eax, %xmm6
2176; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2177; X32-NEXT:    movd %eax, %xmm7
2178; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2179; X32-NEXT:    movd %eax, %xmm0
2180; X32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2181; X32-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2182; X32-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
2183; X32-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
2184; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
2185; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
2186; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
2187; X32-NEXT:    retl
2188;
2189; X64-LABEL: test_mm_set_epi16:
2190; X64:       # BB#0:
2191; X64-NEXT:    movw {{[0-9]+}}(%rsp), %r10w
2192; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2193; X64-NEXT:    movd %edi, %xmm0
2194; X64-NEXT:    movd %r8d, %xmm1
2195; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2196; X64-NEXT:    movd %edx, %xmm0
2197; X64-NEXT:    movd %eax, %xmm2
2198; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2199; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2200; X64-NEXT:    movd %esi, %xmm0
2201; X64-NEXT:    movd %r9d, %xmm1
2202; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2203; X64-NEXT:    movd %ecx, %xmm3
2204; X64-NEXT:    movd %r10d, %xmm0
2205; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
2206; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2207; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2208; X64-NEXT:    retq
2209  %res0  = insertelement <8 x i16> undef, i16 %a7, i32 0
2210  %res1  = insertelement <8 x i16> %res0, i16 %a6, i32 1
2211  %res2  = insertelement <8 x i16> %res1, i16 %a5, i32 2
2212  %res3  = insertelement <8 x i16> %res2, i16 %a4, i32 3
2213  %res4  = insertelement <8 x i16> %res3, i16 %a3, i32 4
2214  %res5  = insertelement <8 x i16> %res4, i16 %a2, i32 5
2215  %res6  = insertelement <8 x i16> %res5, i16 %a1, i32 6
2216  %res7  = insertelement <8 x i16> %res6, i16 %a0, i32 7
2217  %res = bitcast <8 x i16> %res7 to <2 x i64>
2218  ret <2 x i64> %res
2219}
2220
2221define <2 x i64> @test_mm_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
2222; X32-LABEL: test_mm_set_epi32:
2223; X32:       # BB#0:
2224; X32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2225; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2226; X32-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2227; X32-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2228; X32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2229; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2230; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2231; X32-NEXT:    retl
2232;
2233; X64-LABEL: test_mm_set_epi32:
2234; X64:       # BB#0:
2235; X64-NEXT:    movd %edi, %xmm0
2236; X64-NEXT:    movd %edx, %xmm1
2237; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2238; X64-NEXT:    movd %esi, %xmm2
2239; X64-NEXT:    movd %ecx, %xmm0
2240; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2241; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2242; X64-NEXT:    retq
2243  %res0  = insertelement <4 x i32> undef, i32 %a3, i32 0
2244  %res1  = insertelement <4 x i32> %res0, i32 %a2, i32 1
2245  %res2  = insertelement <4 x i32> %res1, i32 %a1, i32 2
2246  %res3  = insertelement <4 x i32> %res2, i32 %a0, i32 3
2247  %res = bitcast <4 x i32> %res3 to <2 x i64>
2248  ret <2 x i64> %res
2249}
2250
2251; TODO test_mm_set_epi64
2252
2253define <2 x i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind {
2254; X32-LABEL: test_mm_set_epi64x:
2255; X32:       # BB#0:
2256; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2257; X32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2258; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2259; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2260; X32-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2261; X32-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2262; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2263; X32-NEXT:    retl
2264;
2265; X64-LABEL: test_mm_set_epi64x:
2266; X64:       # BB#0:
2267; X64-NEXT:    movd %rdi, %xmm1
2268; X64-NEXT:    movd %rsi, %xmm0
2269; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2270; X64-NEXT:    retq
2271  %res0  = insertelement <2 x i64> undef, i64 %a1, i32 0
2272  %res1  = insertelement <2 x i64> %res0, i64 %a0, i32 1
2273  ret <2 x i64> %res1
2274}
2275
2276define <2 x double> @test_mm_set_pd(double %a0, double %a1) nounwind {
2277; X32-LABEL: test_mm_set_pd:
2278; X32:       # BB#0:
2279; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2280; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
2281; X32-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2282; X32-NEXT:    retl
2283;
2284; X64-LABEL: test_mm_set_pd:
2285; X64:       # BB#0:
2286; X64-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2287; X64-NEXT:    movapd %xmm1, %xmm0
2288; X64-NEXT:    retq
2289  %res0  = insertelement <2 x double> undef, double %a1, i32 0
2290  %res1  = insertelement <2 x double> %res0, double %a0, i32 1
2291  ret <2 x double> %res1
2292}
2293
2294define <2 x double> @test_mm_set_sd(double %a0) nounwind {
2295; X32-LABEL: test_mm_set_sd:
2296; X32:       # BB#0:
2297; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2298; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
2299; X32-NEXT:    retl
2300;
2301; X64-LABEL: test_mm_set_sd:
2302; X64:       # BB#0:
2303; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
2304; X64-NEXT:    retq
2305  %res0  = insertelement <2 x double> undef, double %a0, i32 0
2306  %res1  = insertelement <2 x double> %res0, double 0.0, i32 1
2307  ret <2 x double> %res1
2308}
2309
2310define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
2311; X32-LABEL: test_mm_set1_epi8:
2312; X32:       # BB#0:
2313; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2314; X32-NEXT:    movd %eax, %xmm0
2315; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2316; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2317; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2318; X32-NEXT:    retl
2319;
2320; X64-LABEL: test_mm_set1_epi8:
2321; X64:       # BB#0:
2322; X64-NEXT:    movzbl %dil, %eax
2323; X64-NEXT:    movd %eax, %xmm0
2324; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2325; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2326; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2327; X64-NEXT:    retq
2328  %res0  = insertelement <16 x i8> undef,  i8 %a0, i32 0
2329  %res1  = insertelement <16 x i8> %res0,  i8 %a0, i32 1
2330  %res2  = insertelement <16 x i8> %res1,  i8 %a0, i32 2
2331  %res3  = insertelement <16 x i8> %res2,  i8 %a0, i32 3
2332  %res4  = insertelement <16 x i8> %res3,  i8 %a0, i32 4
2333  %res5  = insertelement <16 x i8> %res4,  i8 %a0, i32 5
2334  %res6  = insertelement <16 x i8> %res5,  i8 %a0, i32 6
2335  %res7  = insertelement <16 x i8> %res6,  i8 %a0, i32 7
2336  %res8  = insertelement <16 x i8> %res7,  i8 %a0, i32 8
2337  %res9  = insertelement <16 x i8> %res8,  i8 %a0, i32 9
2338  %res10 = insertelement <16 x i8> %res9,  i8 %a0, i32 10
2339  %res11 = insertelement <16 x i8> %res10, i8 %a0, i32 11
2340  %res12 = insertelement <16 x i8> %res11, i8 %a0, i32 12
2341  %res13 = insertelement <16 x i8> %res12, i8 %a0, i32 13
2342  %res14 = insertelement <16 x i8> %res13, i8 %a0, i32 14
2343  %res15 = insertelement <16 x i8> %res14, i8 %a0, i32 15
2344  %res = bitcast <16 x i8> %res15 to <2 x i64>
2345  ret <2 x i64> %res
2346}
2347
2348define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
2349; X32-LABEL: test_mm_set1_epi16:
2350; X32:       # BB#0:
2351; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2352; X32-NEXT:    movd %eax, %xmm0
2353; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2354; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2355; X32-NEXT:    retl
2356;
2357; X64-LABEL: test_mm_set1_epi16:
2358; X64:       # BB#0:
2359; X64-NEXT:    movd %edi, %xmm0
2360; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2361; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2362; X64-NEXT:    retq
2363  %res0  = insertelement <8 x i16> undef, i16 %a0, i32 0
2364  %res1  = insertelement <8 x i16> %res0, i16 %a0, i32 1
2365  %res2  = insertelement <8 x i16> %res1, i16 %a0, i32 2
2366  %res3  = insertelement <8 x i16> %res2, i16 %a0, i32 3
2367  %res4  = insertelement <8 x i16> %res3, i16 %a0, i32 4
2368  %res5  = insertelement <8 x i16> %res4, i16 %a0, i32 5
2369  %res6  = insertelement <8 x i16> %res5, i16 %a0, i32 6
2370  %res7  = insertelement <8 x i16> %res6, i16 %a0, i32 7
2371  %res = bitcast <8 x i16> %res7 to <2 x i64>
2372  ret <2 x i64> %res
2373}
2374
2375define <2 x i64> @test_mm_set1_epi32(i32 %a0) nounwind {
2376; X32-LABEL: test_mm_set1_epi32:
2377; X32:       # BB#0:
2378; X32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2379; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2380; X32-NEXT:    retl
2381;
2382; X64-LABEL: test_mm_set1_epi32:
2383; X64:       # BB#0:
2384; X64-NEXT:    movd %edi, %xmm0
2385; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2386; X64-NEXT:    retq
2387  %res0  = insertelement <4 x i32> undef, i32 %a0, i32 0
2388  %res1  = insertelement <4 x i32> %res0, i32 %a0, i32 1
2389  %res2  = insertelement <4 x i32> %res1, i32 %a0, i32 2
2390  %res3  = insertelement <4 x i32> %res2, i32 %a0, i32 3
2391  %res = bitcast <4 x i32> %res3 to <2 x i64>
2392  ret <2 x i64> %res
2393}
2394
2395; TODO test_mm_set1_epi64
2396
2397define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind {
2398; X32-LABEL: test_mm_set1_epi64x:
2399; X32:       # BB#0:
2400; X32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2401; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2402; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2403; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
2404; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2405; X32-NEXT:    retl
2406;
2407; X64-LABEL: test_mm_set1_epi64x:
2408; X64:       # BB#0:
2409; X64-NEXT:    movd %rdi, %xmm0
2410; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2411; X64-NEXT:    retq
2412  %res0  = insertelement <2 x i64> undef, i64 %a0, i32 0
2413  %res1  = insertelement <2 x i64> %res0, i64 %a0, i32 1
2414  ret <2 x i64> %res1
2415}
2416
2417define <2 x double> @test_mm_set1_pd(double %a0) nounwind {
2418; X32-LABEL: test_mm_set1_pd:
2419; X32:       # BB#0:
2420; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2421; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2422; X32-NEXT:    retl
2423;
2424; X64-LABEL: test_mm_set1_pd:
2425; X64:       # BB#0:
2426; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2427; X64-NEXT:    retq
2428  %res0  = insertelement <2 x double> undef, double %a0, i32 0
2429  %res1  = insertelement <2 x double> %res0, double %a0, i32 1
2430  ret <2 x double> %res1
2431}
2432
2433define <2 x i64> @test_mm_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) nounwind {
2434; X32-LABEL: test_mm_setr_epi8:
2435; X32:       # BB#0:
2436; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2437; X32-NEXT:    movd %eax, %xmm0
2438; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2439; X32-NEXT:    movd %eax, %xmm1
2440; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2441; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2442; X32-NEXT:    movd %eax, %xmm0
2443; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2444; X32-NEXT:    movd %eax, %xmm2
2445; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2446; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2447; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2448; X32-NEXT:    movd %eax, %xmm0
2449; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2450; X32-NEXT:    movd %eax, %xmm3
2451; X32-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2452; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2453; X32-NEXT:    movd %eax, %xmm0
2454; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2455; X32-NEXT:    movd %eax, %xmm1
2456; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2457; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2458; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2459; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2460; X32-NEXT:    movd %eax, %xmm0
2461; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2462; X32-NEXT:    movd %eax, %xmm2
2463; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2464; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2465; X32-NEXT:    movd %eax, %xmm0
2466; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2467; X32-NEXT:    movd %eax, %xmm3
2468; X32-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2469; X32-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2470; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2471; X32-NEXT:    movd %eax, %xmm0
2472; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2473; X32-NEXT:    movd %eax, %xmm2
2474; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2475; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2476; X32-NEXT:    movd %eax, %xmm4
2477; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2478; X32-NEXT:    movd %eax, %xmm0
2479; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2480; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2481; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2482; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2483; X32-NEXT:    retl
2484;
2485; X64-LABEL: test_mm_setr_epi8:
2486; X64:       # BB#0:
2487; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2488; X64-NEXT:    movd %eax, %xmm0
2489; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2490; X64-NEXT:    movd %eax, %xmm1
2491; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2492; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2493; X64-NEXT:    movd %eax, %xmm0
2494; X64-NEXT:    movzbl %cl, %eax
2495; X64-NEXT:    movd %eax, %xmm2
2496; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2497; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2498; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2499; X64-NEXT:    movd %eax, %xmm0
2500; X64-NEXT:    movzbl %r9b, %eax
2501; X64-NEXT:    movd %eax, %xmm3
2502; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2503; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2504; X64-NEXT:    movd %eax, %xmm0
2505; X64-NEXT:    movzbl %sil, %eax
2506; X64-NEXT:    movd %eax, %xmm1
2507; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2508; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2509; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2510; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2511; X64-NEXT:    movd %eax, %xmm0
2512; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2513; X64-NEXT:    movd %eax, %xmm2
2514; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2515; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2516; X64-NEXT:    movd %eax, %xmm0
2517; X64-NEXT:    movzbl %dl, %eax
2518; X64-NEXT:    movd %eax, %xmm3
2519; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2520; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2521; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2522; X64-NEXT:    movd %eax, %xmm0
2523; X64-NEXT:    movzbl %r8b, %eax
2524; X64-NEXT:    movd %eax, %xmm2
2525; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2526; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2527; X64-NEXT:    movd %eax, %xmm4
2528; X64-NEXT:    movzbl %dil, %eax
2529; X64-NEXT:    movd %eax, %xmm0
2530; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2531; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2532; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2533; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2534; X64-NEXT:    retq
2535  %res0  = insertelement <16 x i8> undef,  i8 %a0 , i32 0
2536  %res1  = insertelement <16 x i8> %res0,  i8 %a1 , i32 1
2537  %res2  = insertelement <16 x i8> %res1,  i8 %a2 , i32 2
2538  %res3  = insertelement <16 x i8> %res2,  i8 %a3 , i32 3
2539  %res4  = insertelement <16 x i8> %res3,  i8 %a4 , i32 4
2540  %res5  = insertelement <16 x i8> %res4,  i8 %a5 , i32 5
2541  %res6  = insertelement <16 x i8> %res5,  i8 %a6 , i32 6
2542  %res7  = insertelement <16 x i8> %res6,  i8 %a7 , i32 7
2543  %res8  = insertelement <16 x i8> %res7,  i8 %a8 , i32 8
2544  %res9  = insertelement <16 x i8> %res8,  i8 %a9 , i32 9
2545  %res10 = insertelement <16 x i8> %res9,  i8 %a10, i32 10
2546  %res11 = insertelement <16 x i8> %res10, i8 %a11, i32 11
2547  %res12 = insertelement <16 x i8> %res11, i8 %a12, i32 12
2548  %res13 = insertelement <16 x i8> %res12, i8 %a13, i32 13
2549  %res14 = insertelement <16 x i8> %res13, i8 %a14, i32 14
2550  %res15 = insertelement <16 x i8> %res14, i8 %a15, i32 15
2551  %res = bitcast <16 x i8> %res15 to <2 x i64>
2552  ret <2 x i64> %res
2553}
2554
2555define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
2556; X32-LABEL: test_mm_setr_epi16:
2557; X32:       # BB#0:
2558; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2559; X32-NEXT:    movd %eax, %xmm1
2560; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2561; X32-NEXT:    movd %eax, %xmm2
2562; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2563; X32-NEXT:    movd %eax, %xmm3
2564; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2565; X32-NEXT:    movd %eax, %xmm4
2566; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2567; X32-NEXT:    movd %eax, %xmm5
2568; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2569; X32-NEXT:    movd %eax, %xmm6
2570; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2571; X32-NEXT:    movd %eax, %xmm7
2572; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2573; X32-NEXT:    movd %eax, %xmm0
2574; X32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2575; X32-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2576; X32-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
2577; X32-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
2578; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
2579; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
2580; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
2581; X32-NEXT:    retl
2582;
2583; X64-LABEL: test_mm_setr_epi16:
2584; X64:       # BB#0:
2585; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2586; X64-NEXT:    movw {{[0-9]+}}(%rsp), %r10w
2587; X64-NEXT:    movd %eax, %xmm0
2588; X64-NEXT:    movd %ecx, %xmm1
2589; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2590; X64-NEXT:    movd %r9d, %xmm0
2591; X64-NEXT:    movd %esi, %xmm2
2592; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2593; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2594; X64-NEXT:    movd %r10d, %xmm0
2595; X64-NEXT:    movd %edx, %xmm1
2596; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2597; X64-NEXT:    movd %r8d, %xmm3
2598; X64-NEXT:    movd %edi, %xmm0
2599; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
2600; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2601; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2602; X64-NEXT:    retq
2603  %res0  = insertelement <8 x i16> undef, i16 %a0, i32 0
2604  %res1  = insertelement <8 x i16> %res0, i16 %a1, i32 1
2605  %res2  = insertelement <8 x i16> %res1, i16 %a2, i32 2
2606  %res3  = insertelement <8 x i16> %res2, i16 %a3, i32 3
2607  %res4  = insertelement <8 x i16> %res3, i16 %a4, i32 4
2608  %res5  = insertelement <8 x i16> %res4, i16 %a5, i32 5
2609  %res6  = insertelement <8 x i16> %res5, i16 %a6, i32 6
2610  %res7  = insertelement <8 x i16> %res6, i16 %a7, i32 7
2611  %res = bitcast <8 x i16> %res7 to <2 x i64>
2612  ret <2 x i64> %res
2613}
2614
2615define <2 x i64> @test_mm_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
2616; X32-LABEL: test_mm_setr_epi32:
2617; X32:       # BB#0:
2618; X32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2619; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2620; X32-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2621; X32-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2622; X32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2623; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2624; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2625; X32-NEXT:    retl
2626;
2627; X64-LABEL: test_mm_setr_epi32:
2628; X64:       # BB#0:
2629; X64-NEXT:    movd %ecx, %xmm0
2630; X64-NEXT:    movd %esi, %xmm1
2631; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2632; X64-NEXT:    movd %edx, %xmm2
2633; X64-NEXT:    movd %edi, %xmm0
2634; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2635; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2636; X64-NEXT:    retq
2637  %res0  = insertelement <4 x i32> undef, i32 %a0, i32 0
2638  %res1  = insertelement <4 x i32> %res0, i32 %a1, i32 1
2639  %res2  = insertelement <4 x i32> %res1, i32 %a2, i32 2
2640  %res3  = insertelement <4 x i32> %res2, i32 %a3, i32 3
2641  %res = bitcast <4 x i32> %res3 to <2 x i64>
2642  ret <2 x i64> %res
2643}
2644
2645; TODO test_mm_setr_epi64
2646
2647define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind {
2648; X32-LABEL: test_mm_setr_epi64x:
2649; X32:       # BB#0:
2650; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2651; X32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2652; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2653; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2654; X32-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2655; X32-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2656; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2657; X32-NEXT:    retl
2658;
2659; X64-LABEL: test_mm_setr_epi64x:
2660; X64:       # BB#0:
2661; X64-NEXT:    movd %rsi, %xmm1
2662; X64-NEXT:    movd %rdi, %xmm0
2663; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2664; X64-NEXT:    retq
2665  %res0  = insertelement <2 x i64> undef, i64 %a0, i32 0
2666  %res1  = insertelement <2 x i64> %res0, i64 %a1, i32 1
2667  ret <2 x i64> %res1
2668}
2669
2670define <2 x double> @test_mm_setr_pd(double %a0, double %a1) nounwind {
2671; X32-LABEL: test_mm_setr_pd:
2672; X32:       # BB#0:
2673; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
2674; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2675; X32-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2676; X32-NEXT:    retl
2677;
2678; X64-LABEL: test_mm_setr_pd:
2679; X64:       # BB#0:
2680; X64-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2681; X64-NEXT:    retq
2682  %res0  = insertelement <2 x double> undef, double %a0, i32 0
2683  %res1  = insertelement <2 x double> %res0, double %a1, i32 1
2684  ret <2 x double> %res1
2685}
2686
2687define <2 x double> @test_mm_setzero_pd() {
2688; X32-LABEL: test_mm_setzero_pd:
2689; X32:       # BB#0:
2690; X32-NEXT:    xorps %xmm0, %xmm0
2691; X32-NEXT:    retl
2692;
2693; X64-LABEL: test_mm_setzero_pd:
2694; X64:       # BB#0:
2695; X64-NEXT:    xorps %xmm0, %xmm0
2696; X64-NEXT:    retq
2697  ret <2 x double> zeroinitializer
2698}
2699
2700define <2 x i64> @test_mm_setzero_si128() {
2701; X32-LABEL: test_mm_setzero_si128:
2702; X32:       # BB#0:
2703; X32-NEXT:    xorps %xmm0, %xmm0
2704; X32-NEXT:    retl
2705;
2706; X64-LABEL: test_mm_setzero_si128:
2707; X64:       # BB#0:
2708; X64-NEXT:    xorps %xmm0, %xmm0
2709; X64-NEXT:    retq
2710  ret <2 x i64> zeroinitializer
2711}
2712
2713define <2 x i64> @test_mm_shuffle_epi32(<2 x i64> %a0) {
2714; X32-LABEL: test_mm_shuffle_epi32:
2715; X32:       # BB#0:
2716; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2717; X32-NEXT:    retl
2718;
2719; X64-LABEL: test_mm_shuffle_epi32:
2720; X64:       # BB#0:
2721; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2722; X64-NEXT:    retq
2723  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2724  %res = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
2725  %bc = bitcast <4 x i32> %res to <2 x i64>
2726  ret <2 x i64> %bc
2727}
2728
2729define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
2730; X32-LABEL: test_mm_shuffle_pd:
2731; X32:       # BB#0:
2732; X32-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
2733; X32-NEXT:    retl
2734;
2735; X64-LABEL: test_mm_shuffle_pd:
2736; X64:       # BB#0:
2737; X64-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
2738; X64-NEXT:    retq
2739  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
2740  ret <2 x double> %res
2741}
2742
2743define <2 x i64> @test_mm_shufflehi_epi16(<2 x i64> %a0) {
2744; X32-LABEL: test_mm_shufflehi_epi16:
2745; X32:       # BB#0:
2746; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
2747; X32-NEXT:    retl
2748;
2749; X64-LABEL: test_mm_shufflehi_epi16:
2750; X64:       # BB#0:
2751; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
2752; X64-NEXT:    retq
2753  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
2754  %res = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
2755  %bc = bitcast <8 x i16> %res to <2 x i64>
2756  ret <2 x i64> %bc
2757}
2758
2759define <2 x i64> @test_mm_shufflelo_epi16(<2 x i64> %a0) {
2760; X32-LABEL: test_mm_shufflelo_epi16:
2761; X32:       # BB#0:
2762; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2763; X32-NEXT:    retl
2764;
2765; X64-LABEL: test_mm_shufflelo_epi16:
2766; X64:       # BB#0:
2767; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2768; X64-NEXT:    retq
2769  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
2770  %res = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
2771  %bc = bitcast <8 x i16> %res to <2 x i64>
2772  ret <2 x i64> %bc
2773}
2774
2775define <2 x i64> @test_mm_sll_epi16(<2 x i64> %a0, <2 x i64> %a1) {
2776; X32-LABEL: test_mm_sll_epi16:
2777; X32:       # BB#0:
2778; X32-NEXT:    psllw %xmm1, %xmm0
2779; X32-NEXT:    retl
2780;
2781; X64-LABEL: test_mm_sll_epi16:
2782; X64:       # BB#0:
2783; X64-NEXT:    psllw %xmm1, %xmm0
2784; X64-NEXT:    retq
2785  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
2786  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2787  %res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %arg0, <8 x i16> %arg1)
2788  %bc = bitcast <8 x i16> %res to <2 x i64>
2789  ret <2 x i64> %bc
2790}
2791declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
2792
2793define <2 x i64> @test_mm_sll_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2794; X32-LABEL: test_mm_sll_epi32:
2795; X32:       # BB#0:
2796; X32-NEXT:    pslld %xmm1, %xmm0
2797; X32-NEXT:    retl
2798;
2799; X64-LABEL: test_mm_sll_epi32:
2800; X64:       # BB#0:
2801; X64-NEXT:    pslld %xmm1, %xmm0
2802; X64-NEXT:    retq
2803  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2804  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2805  %res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %arg0, <4 x i32> %arg1)
2806  %bc = bitcast <4 x i32> %res to <2 x i64>
2807  ret <2 x i64> %bc
2808}
2809declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
2810
2811define <2 x i64> @test_mm_sll_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2812; X32-LABEL: test_mm_sll_epi64:
2813; X32:       # BB#0:
2814; X32-NEXT:    psllq %xmm1, %xmm0
2815; X32-NEXT:    retl
2816;
2817; X64-LABEL: test_mm_sll_epi64:
2818; X64:       # BB#0:
2819; X64-NEXT:    psllq %xmm1, %xmm0
2820; X64-NEXT:    retq
2821  %res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
2822  ret <2 x i64> %res
2823}
2824declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
2825
2826define <2 x i64> @test_mm_slli_epi16(<2 x i64> %a0) {
2827; X32-LABEL: test_mm_slli_epi16:
2828; X32:       # BB#0:
2829; X32-NEXT:    psllw $1, %xmm0
2830; X32-NEXT:    retl
2831;
2832; X64-LABEL: test_mm_slli_epi16:
2833; X64:       # BB#0:
2834; X64-NEXT:    psllw $1, %xmm0
2835; X64-NEXT:    retq
2836  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
2837  %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 1)
2838  %bc = bitcast <8 x i16> %res to <2 x i64>
2839  ret <2 x i64> %bc
2840}
2841declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
2842
2843define <2 x i64> @test_mm_slli_epi32(<2 x i64> %a0) {
2844; X32-LABEL: test_mm_slli_epi32:
2845; X32:       # BB#0:
2846; X32-NEXT:    pslld $1, %xmm0
2847; X32-NEXT:    retl
2848;
2849; X64-LABEL: test_mm_slli_epi32:
2850; X64:       # BB#0:
2851; X64-NEXT:    pslld $1, %xmm0
2852; X64-NEXT:    retq
2853  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2854  %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 1)
2855  %bc = bitcast <4 x i32> %res to <2 x i64>
2856  ret <2 x i64> %bc
2857}
2858declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
2859
2860define <2 x i64> @test_mm_slli_epi64(<2 x i64> %a0) {
2861; X32-LABEL: test_mm_slli_epi64:
2862; X32:       # BB#0:
2863; X32-NEXT:    psllq $1, %xmm0
2864; X32-NEXT:    retl
2865;
2866; X64-LABEL: test_mm_slli_epi64:
2867; X64:       # BB#0:
2868; X64-NEXT:    psllq $1, %xmm0
2869; X64-NEXT:    retq
2870  %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 1)
2871  ret <2 x i64> %res
2872}
2873declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
2874
2875define <2 x i64> @test_mm_slli_si128(<2 x i64> %a0) nounwind {
2876; X32-LABEL: test_mm_slli_si128:
2877; X32:       # BB#0:
2878; X32-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
2879; X32-NEXT:    retl
2880;
2881; X64-LABEL: test_mm_slli_si128:
2882; X64:       # BB#0:
2883; X64-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
2884; X64-NEXT:    retq
2885  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
2886  %res = shufflevector <16 x i8> zeroinitializer, <16 x i8> %arg0, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
2887  %bc = bitcast <16 x i8> %res to <2 x i64>
2888  ret <2 x i64> %bc
2889}
2890
2891define <2 x double> @test_mm_sqrt_pd(<2 x double> %a0) nounwind {
2892; X32-LABEL: test_mm_sqrt_pd:
2893; X32:       # BB#0:
2894; X32-NEXT:    sqrtpd %xmm0, %xmm0
2895; X32-NEXT:    retl
2896;
2897; X64-LABEL: test_mm_sqrt_pd:
2898; X64:       # BB#0:
2899; X64-NEXT:    sqrtpd %xmm0, %xmm0
2900; X64-NEXT:    retq
2901  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
2902  ret <2 x double> %res
2903}
2904declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
2905
2906define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
2907; X32-LABEL: test_mm_sqrt_sd:
2908; X32:       # BB#0:
2909; X32-NEXT:    sqrtsd %xmm0, %xmm1
2910; X32-NEXT:    movaps %xmm1, %xmm0
2911; X32-NEXT:    retl
2912;
2913; X64-LABEL: test_mm_sqrt_sd:
2914; X64:       # BB#0:
2915; X64-NEXT:    sqrtsd %xmm0, %xmm1
2916; X64-NEXT:    movaps %xmm1, %xmm0
2917; X64-NEXT:    retq
2918  %call = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
2919  %ext0 = extractelement <2 x double> %call, i32 0
2920  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
2921  %ext1 = extractelement <2 x double> %a1, i32 1
2922  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
2923  ret <2 x double> %ins1
2924}
2925declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
2926
2927define <2 x i64> @test_mm_sra_epi16(<2 x i64> %a0, <2 x i64> %a1) {
2928; X32-LABEL: test_mm_sra_epi16:
2929; X32:       # BB#0:
2930; X32-NEXT:    psraw %xmm1, %xmm0
2931; X32-NEXT:    retl
2932;
2933; X64-LABEL: test_mm_sra_epi16:
2934; X64:       # BB#0:
2935; X64-NEXT:    psraw %xmm1, %xmm0
2936; X64-NEXT:    retq
2937  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
2938  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2939  %res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %arg0, <8 x i16> %arg1)
2940  %bc = bitcast <8 x i16> %res to <2 x i64>
2941  ret <2 x i64> %bc
2942}
2943declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
2944
2945define <2 x i64> @test_mm_sra_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2946; X32-LABEL: test_mm_sra_epi32:
2947; X32:       # BB#0:
2948; X32-NEXT:    psrad %xmm1, %xmm0
2949; X32-NEXT:    retl
2950;
2951; X64-LABEL: test_mm_sra_epi32:
2952; X64:       # BB#0:
2953; X64-NEXT:    psrad %xmm1, %xmm0
2954; X64-NEXT:    retq
2955  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2956  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2957  %res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %arg0, <4 x i32> %arg1)
2958  %bc = bitcast <4 x i32> %res to <2 x i64>
2959  ret <2 x i64> %bc
2960}
2961declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
2962
2963define <2 x i64> @test_mm_srai_epi16(<2 x i64> %a0) {
2964; X32-LABEL: test_mm_srai_epi16:
2965; X32:       # BB#0:
2966; X32-NEXT:    psraw $1, %xmm0
2967; X32-NEXT:    retl
2968;
2969; X64-LABEL: test_mm_srai_epi16:
2970; X64:       # BB#0:
2971; X64-NEXT:    psraw $1, %xmm0
2972; X64-NEXT:    retq
2973  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
2974  %res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %arg0, i32 1)
2975  %bc = bitcast <8 x i16> %res to <2 x i64>
2976  ret <2 x i64> %bc
2977}
2978declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
2979
2980define <2 x i64> @test_mm_srai_epi32(<2 x i64> %a0) {
2981; X32-LABEL: test_mm_srai_epi32:
2982; X32:       # BB#0:
2983; X32-NEXT:    psrad $1, %xmm0
2984; X32-NEXT:    retl
2985;
2986; X64-LABEL: test_mm_srai_epi32:
2987; X64:       # BB#0:
2988; X64-NEXT:    psrad $1, %xmm0
2989; X64-NEXT:    retq
2990  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2991  %res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %arg0, i32 1)
2992  %bc = bitcast <4 x i32> %res to <2 x i64>
2993  ret <2 x i64> %bc
2994}
2995declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
2996
2997define <2 x i64> @test_mm_srl_epi16(<2 x i64> %a0, <2 x i64> %a1) {
2998; X32-LABEL: test_mm_srl_epi16:
2999; X32:       # BB#0:
3000; X32-NEXT:    psrlw %xmm1, %xmm0
3001; X32-NEXT:    retl
3002;
3003; X64-LABEL: test_mm_srl_epi16:
3004; X64:       # BB#0:
3005; X64-NEXT:    psrlw %xmm1, %xmm0
3006; X64-NEXT:    retq
3007  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
3008  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
3009  %res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %arg0, <8 x i16> %arg1)
3010  %bc = bitcast <8 x i16> %res to <2 x i64>
3011  ret <2 x i64> %bc
3012}
3013declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
3014
3015define <2 x i64> @test_mm_srl_epi32(<2 x i64> %a0, <2 x i64> %a1) {
3016; X32-LABEL: test_mm_srl_epi32:
3017; X32:       # BB#0:
3018; X32-NEXT:    psrld %xmm1, %xmm0
3019; X32-NEXT:    retl
3020;
3021; X64-LABEL: test_mm_srl_epi32:
3022; X64:       # BB#0:
3023; X64-NEXT:    psrld %xmm1, %xmm0
3024; X64-NEXT:    retq
3025  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
3026  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
3027  %res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %arg0, <4 x i32> %arg1)
3028  %bc = bitcast <4 x i32> %res to <2 x i64>
3029  ret <2 x i64> %bc
3030}
3031declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
3032
3033define <2 x i64> @test_mm_srl_epi64(<2 x i64> %a0, <2 x i64> %a1) {
3034; X32-LABEL: test_mm_srl_epi64:
3035; X32:       # BB#0:
3036; X32-NEXT:    psrlq %xmm1, %xmm0
3037; X32-NEXT:    retl
3038;
3039; X64-LABEL: test_mm_srl_epi64:
3040; X64:       # BB#0:
3041; X64-NEXT:    psrlq %xmm1, %xmm0
3042; X64-NEXT:    retq
3043  %res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
3044  ret <2 x i64> %res
3045}
3046declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
3047
3048define <2 x i64> @test_mm_srli_epi16(<2 x i64> %a0) {
3049; X32-LABEL: test_mm_srli_epi16:
3050; X32:       # BB#0:
3051; X32-NEXT:    psrlw $1, %xmm0
3052; X32-NEXT:    retl
3053;
3054; X64-LABEL: test_mm_srli_epi16:
3055; X64:       # BB#0:
3056; X64-NEXT:    psrlw $1, %xmm0
3057; X64-NEXT:    retq
3058  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
3059  %res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %arg0, i32 1)
3060  %bc = bitcast <8 x i16> %res to <2 x i64>
3061  ret <2 x i64> %bc
3062}
3063declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
3064
3065define <2 x i64> @test_mm_srli_epi32(<2 x i64> %a0) {
3066; X32-LABEL: test_mm_srli_epi32:
3067; X32:       # BB#0:
3068; X32-NEXT:    psrld $1, %xmm0
3069; X32-NEXT:    retl
3070;
3071; X64-LABEL: test_mm_srli_epi32:
3072; X64:       # BB#0:
3073; X64-NEXT:    psrld $1, %xmm0
3074; X64-NEXT:    retq
3075  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
3076  %res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %arg0, i32 1)
3077  %bc = bitcast <4 x i32> %res to <2 x i64>
3078  ret <2 x i64> %bc
3079}
3080declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
3081
3082define <2 x i64> @test_mm_srli_epi64(<2 x i64> %a0) {
3083; X32-LABEL: test_mm_srli_epi64:
3084; X32:       # BB#0:
3085; X32-NEXT:    psrlq $1, %xmm0
3086; X32-NEXT:    retl
3087;
3088; X64-LABEL: test_mm_srli_epi64:
3089; X64:       # BB#0:
3090; X64-NEXT:    psrlq $1, %xmm0
3091; X64-NEXT:    retq
3092  %res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 1)
3093  ret <2 x i64> %res
3094}
3095declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
3096
3097define <2 x i64> @test_mm_srli_si128(<2 x i64> %a0) nounwind {
3098; X32-LABEL: test_mm_srli_si128:
3099; X32:       # BB#0:
3100; X32-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
3101; X32-NEXT:    retl
3102;
3103; X64-LABEL: test_mm_srli_si128:
3104; X64:       # BB#0:
3105; X64-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
3106; X64-NEXT:    retq
3107  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
3108  %res = shufflevector <16 x i8> %arg0, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
3109  %bc = bitcast <16 x i8> %res to <2 x i64>
3110  ret <2 x i64> %bc
3111}
3112
3113define void @test_mm_store_pd(double *%a0, <2 x double> %a1) {
3114; X32-LABEL: test_mm_store_pd:
3115; X32:       # BB#0:
3116; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3117; X32-NEXT:    movaps %xmm0, (%eax)
3118; X32-NEXT:    retl
3119;
3120; X64-LABEL: test_mm_store_pd:
3121; X64:       # BB#0:
3122; X64-NEXT:    movaps %xmm0, (%rdi)
3123; X64-NEXT:    retq
3124  %arg0 = bitcast double* %a0 to <2 x double>*
3125  store <2 x double> %a1, <2 x double>* %arg0, align 16
3126  ret void
3127}
3128
3129define void @test_mm_store_pd1(double *%a0, <2 x double> %a1) {
3130; X32-LABEL: test_mm_store_pd1:
3131; X32:       # BB#0:
3132; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3133; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
3134; X32-NEXT:    movaps %xmm0, (%eax)
3135; X32-NEXT:    retl
3136;
3137; X64-LABEL: test_mm_store_pd1:
3138; X64:       # BB#0:
3139; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
3140; X64-NEXT:    movaps %xmm0, (%rdi)
3141; X64-NEXT:    retq
3142  %arg0 = bitcast double * %a0 to <2 x double>*
3143  %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
3144  store <2 x double> %shuf, <2 x double>* %arg0, align 16
3145  ret void
3146}
3147
3148define void @test_mm_store_sd(double *%a0, <2 x double> %a1) {
3149; X32-LABEL: test_mm_store_sd:
3150; X32:       # BB#0:
3151; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3152; X32-NEXT:    movsd %xmm0, (%eax)
3153; X32-NEXT:    retl
3154;
3155; X64-LABEL: test_mm_store_sd:
3156; X64:       # BB#0:
3157; X64-NEXT:    movsd %xmm0, (%rdi)
3158; X64-NEXT:    retq
3159  %ext = extractelement <2 x double> %a1, i32 0
3160  store double %ext, double* %a0, align 1
3161  ret void
3162}
3163
3164define void @test_mm_store_si128(<2 x i64> *%a0, <2 x i64> %a1) {
3165; X32-LABEL: test_mm_store_si128:
3166; X32:       # BB#0:
3167; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3168; X32-NEXT:    movaps %xmm0, (%eax)
3169; X32-NEXT:    retl
3170;
3171; X64-LABEL: test_mm_store_si128:
3172; X64:       # BB#0:
3173; X64-NEXT:    movaps %xmm0, (%rdi)
3174; X64-NEXT:    retq
3175  store <2 x i64> %a1, <2 x i64>* %a0, align 16
3176  ret void
3177}
3178
3179define void @test_mm_store1_pd(double *%a0, <2 x double> %a1) {
3180; X32-LABEL: test_mm_store1_pd:
3181; X32:       # BB#0:
3182; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3183; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
3184; X32-NEXT:    movaps %xmm0, (%eax)
3185; X32-NEXT:    retl
3186;
3187; X64-LABEL: test_mm_store1_pd:
3188; X64:       # BB#0:
3189; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
3190; X64-NEXT:    movaps %xmm0, (%rdi)
3191; X64-NEXT:    retq
3192  %arg0 = bitcast double * %a0 to <2 x double>*
3193  %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
3194  store <2 x double> %shuf, <2 x double>* %arg0, align 16
3195  ret void
3196}
3197
3198define void @test_mm_storeh_sd(double *%a0, <2 x double> %a1) {
3199; X32-LABEL: test_mm_storeh_sd:
3200; X32:       # BB#0:
3201; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3202; X32-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
3203; X32-NEXT:    movsd %xmm0, (%eax)
3204; X32-NEXT:    retl
3205;
3206; X64-LABEL: test_mm_storeh_sd:
3207; X64:       # BB#0:
3208; X64-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
3209; X64-NEXT:    movsd %xmm0, (%rdi)
3210; X64-NEXT:    retq
3211  %ext = extractelement <2 x double> %a1, i32 1
3212  store double %ext, double* %a0, align 8
3213  ret void
3214}
3215
3216define void @test_mm_storel_epi64(<2 x i64> *%a0, <2 x i64> %a1) {
3217; X32-LABEL: test_mm_storel_epi64:
3218; X32:       # BB#0:
3219; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3220; X32-NEXT:    movlps %xmm0, (%eax)
3221; X32-NEXT:    retl
3222;
3223; X64-LABEL: test_mm_storel_epi64:
3224; X64:       # BB#0:
3225; X64-NEXT:    movd %xmm0, %rax
3226; X64-NEXT:    movq %rax, (%rdi)
3227; X64-NEXT:    retq
3228  %ext = extractelement <2 x i64> %a1, i32 0
3229  %bc = bitcast <2 x i64> *%a0 to i64*
3230  store i64 %ext, i64* %bc, align 8
3231  ret void
3232}
3233
3234define void @test_mm_storel_sd(double *%a0, <2 x double> %a1) {
3235; X32-LABEL: test_mm_storel_sd:
3236; X32:       # BB#0:
3237; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3238; X32-NEXT:    movsd %xmm0, (%eax)
3239; X32-NEXT:    retl
3240;
3241; X64-LABEL: test_mm_storel_sd:
3242; X64:       # BB#0:
3243; X64-NEXT:    movsd %xmm0, (%rdi)
3244; X64-NEXT:    retq
3245  %ext = extractelement <2 x double> %a1, i32 0
3246  store double %ext, double* %a0, align 8
3247  ret void
3248}
3249
3250define void @test_mm_storer_pd(double *%a0, <2 x double> %a1) {
3251; X32-LABEL: test_mm_storer_pd:
3252; X32:       # BB#0:
3253; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3254; X32-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
3255; X32-NEXT:    movapd %xmm0, (%eax)
3256; X32-NEXT:    retl
3257;
3258; X64-LABEL: test_mm_storer_pd:
3259; X64:       # BB#0:
3260; X64-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
3261; X64-NEXT:    movapd %xmm0, (%rdi)
3262; X64-NEXT:    retq
3263  %arg0 = bitcast double* %a0 to <2 x double>*
3264  %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
3265  store <2 x double> %shuf, <2 x double>* %arg0, align 16
3266  ret void
3267}
3268
3269define void @test_mm_storeu_pd(double *%a0, <2 x double> %a1) {
3270; X32-LABEL: test_mm_storeu_pd:
3271; X32:       # BB#0:
3272; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3273; X32-NEXT:    movups %xmm0, (%eax)
3274; X32-NEXT:    retl
3275;
3276; X64-LABEL: test_mm_storeu_pd:
3277; X64:       # BB#0:
3278; X64-NEXT:    movups %xmm0, (%rdi)
3279; X64-NEXT:    retq
3280  %arg0 = bitcast double* %a0 to <2 x double>*
3281  store <2 x double> %a1, <2 x double>* %arg0, align 1
3282  ret void
3283}
3284
3285define void @test_mm_storeu_si128(<2 x i64> *%a0, <2 x i64> %a1) {
3286; X32-LABEL: test_mm_storeu_si128:
3287; X32:       # BB#0:
3288; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3289; X32-NEXT:    movups %xmm0, (%eax)
3290; X32-NEXT:    retl
3291;
3292; X64-LABEL: test_mm_storeu_si128:
3293; X64:       # BB#0:
3294; X64-NEXT:    movups %xmm0, (%rdi)
3295; X64-NEXT:    retq
3296  store <2 x i64> %a1, <2 x i64>* %a0, align 1
3297  ret void
3298}
3299
3300define void @test_mm_stream_pd(double *%a0, <2 x double> %a1) {
3301; X32-LABEL: test_mm_stream_pd:
3302; X32:       # BB#0:
3303; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3304; X32-NEXT:    movntps %xmm0, (%eax)
3305; X32-NEXT:    retl
3306;
3307; X64-LABEL: test_mm_stream_pd:
3308; X64:       # BB#0:
3309; X64-NEXT:    movntps %xmm0, (%rdi)
3310; X64-NEXT:    retq
3311  %arg0 = bitcast double* %a0 to <2 x double>*
3312  store <2 x double> %a1, <2 x double>* %arg0, align 16, !nontemporal !0
3313  ret void
3314}
3315
3316define void @test_mm_stream_si32(i32 *%a0, i32 %a1) {
3317; X32-LABEL: test_mm_stream_si32:
3318; X32:       # BB#0:
3319; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3320; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
3321; X32-NEXT:    movntil %eax, (%ecx)
3322; X32-NEXT:    retl
3323;
3324; X64-LABEL: test_mm_stream_si32:
3325; X64:       # BB#0:
3326; X64-NEXT:    movntil %esi, (%rdi)
3327; X64-NEXT:    retq
3328  store i32 %a1, i32* %a0, align 1, !nontemporal !0
3329  ret void
3330}
3331
3332define void @test_mm_stream_si128(<2 x i64> *%a0, <2 x i64> %a1) {
3333; X32-LABEL: test_mm_stream_si128:
3334; X32:       # BB#0:
3335; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3336; X32-NEXT:    movntps %xmm0, (%eax)
3337; X32-NEXT:    retl
3338;
3339; X64-LABEL: test_mm_stream_si128:
3340; X64:       # BB#0:
3341; X64-NEXT:    movntps %xmm0, (%rdi)
3342; X64-NEXT:    retq
3343  store <2 x i64> %a1, <2 x i64>* %a0, align 16, !nontemporal !0
3344  ret void
3345}
3346
3347define <2 x i64> @test_mm_sub_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
3348; X32-LABEL: test_mm_sub_epi8:
3349; X32:       # BB#0:
3350; X32-NEXT:    psubb %xmm1, %xmm0
3351; X32-NEXT:    retl
3352;
3353; X64-LABEL: test_mm_sub_epi8:
3354; X64:       # BB#0:
3355; X64-NEXT:    psubb %xmm1, %xmm0
3356; X64-NEXT:    retq
3357  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
3358  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
3359  %res = sub <16 x i8> %arg0, %arg1
3360  %bc = bitcast <16 x i8> %res to <2 x i64>
3361  ret <2 x i64> %bc
3362}
3363
3364define <2 x i64> @test_mm_sub_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
3365; X32-LABEL: test_mm_sub_epi16:
3366; X32:       # BB#0:
3367; X32-NEXT:    psubw %xmm1, %xmm0
3368; X32-NEXT:    retl
3369;
3370; X64-LABEL: test_mm_sub_epi16:
3371; X64:       # BB#0:
3372; X64-NEXT:    psubw %xmm1, %xmm0
3373; X64-NEXT:    retq
3374  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
3375  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
3376  %res = sub <8 x i16> %arg0, %arg1
3377  %bc = bitcast <8 x i16> %res to <2 x i64>
3378  ret <2 x i64> %bc
3379}
3380
3381define <2 x i64> @test_mm_sub_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
3382; X32-LABEL: test_mm_sub_epi32:
3383; X32:       # BB#0:
3384; X32-NEXT:    psubd %xmm1, %xmm0
3385; X32-NEXT:    retl
3386;
3387; X64-LABEL: test_mm_sub_epi32:
3388; X64:       # BB#0:
3389; X64-NEXT:    psubd %xmm1, %xmm0
3390; X64-NEXT:    retq
3391  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
3392  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
3393  %res = sub <4 x i32> %arg0, %arg1
3394  %bc = bitcast <4 x i32> %res to <2 x i64>
3395  ret <2 x i64> %bc
3396}
3397
3398define <2 x i64> @test_mm_sub_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
3399; X32-LABEL: test_mm_sub_epi64:
3400; X32:       # BB#0:
3401; X32-NEXT:    psubq %xmm1, %xmm0
3402; X32-NEXT:    retl
3403;
3404; X64-LABEL: test_mm_sub_epi64:
3405; X64:       # BB#0:
3406; X64-NEXT:    psubq %xmm1, %xmm0
3407; X64-NEXT:    retq
3408  %res = sub <2 x i64> %a0, %a1
3409  ret <2 x i64> %res
3410}
3411
3412define <2 x double> @test_mm_sub_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
3413; X32-LABEL: test_mm_sub_pd:
3414; X32:       # BB#0:
3415; X32-NEXT:    subpd %xmm1, %xmm0
3416; X32-NEXT:    retl
3417;
3418; X64-LABEL: test_mm_sub_pd:
3419; X64:       # BB#0:
3420; X64-NEXT:    subpd %xmm1, %xmm0
3421; X64-NEXT:    retq
3422  %res = fsub <2 x double> %a0, %a1
3423  ret <2 x double> %res
3424}
3425
3426define <2 x double> @test_mm_sub_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
3427; X32-LABEL: test_mm_sub_sd:
3428; X32:       # BB#0:
3429; X32-NEXT:    subsd %xmm1, %xmm0
3430; X32-NEXT:    retl
3431;
3432; X64-LABEL: test_mm_sub_sd:
3433; X64:       # BB#0:
3434; X64-NEXT:    subsd %xmm1, %xmm0
3435; X64-NEXT:    retq
3436  %ext0 = extractelement <2 x double> %a0, i32 0
3437  %ext1 = extractelement <2 x double> %a1, i32 0
3438  %fsub = fsub double %ext0, %ext1
3439  %res = insertelement <2 x double> %a0, double %fsub, i32 0
3440  ret <2 x double> %res
3441}
3442
3443define <2 x i64> @test_mm_subs_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
3444; X32-LABEL: test_mm_subs_epi8:
3445; X32:       # BB#0:
3446; X32-NEXT:    psubsb %xmm1, %xmm0
3447; X32-NEXT:    retl
3448;
3449; X64-LABEL: test_mm_subs_epi8:
3450; X64:       # BB#0:
3451; X64-NEXT:    psubsb %xmm1, %xmm0
3452; X64-NEXT:    retq
3453  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
3454  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
3455  %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %arg0, <16 x i8> %arg1)
3456  %bc = bitcast <16 x i8> %res to <2 x i64>
3457  ret <2 x i64> %bc
3458}
3459declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
3460
3461define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
3462; X32-LABEL: test_mm_subs_epi16:
3463; X32:       # BB#0:
3464; X32-NEXT:    psubsw %xmm1, %xmm0
3465; X32-NEXT:    retl
3466;
3467; X64-LABEL: test_mm_subs_epi16:
3468; X64:       # BB#0:
3469; X64-NEXT:    psubsw %xmm1, %xmm0
3470; X64-NEXT:    retq
3471  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
3472  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
3473  %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %arg0, <8 x i16> %arg1)
3474  %bc = bitcast <8 x i16> %res to <2 x i64>
3475  ret <2 x i64> %bc
3476}
3477declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
3478
3479define <2 x i64> @test_mm_subs_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
3480; X32-LABEL: test_mm_subs_epu8:
3481; X32:       # BB#0:
3482; X32-NEXT:    psubusb %xmm1, %xmm0
3483; X32-NEXT:    retl
3484;
3485; X64-LABEL: test_mm_subs_epu8:
3486; X64:       # BB#0:
3487; X64-NEXT:    psubusb %xmm1, %xmm0
3488; X64-NEXT:    retq
3489  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
3490  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
3491  %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %arg0, <16 x i8> %arg1)
3492  %bc = bitcast <16 x i8> %res to <2 x i64>
3493  ret <2 x i64> %bc
3494}
3495declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
3496
3497define <2 x i64> @test_mm_subs_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
3498; X32-LABEL: test_mm_subs_epu16:
3499; X32:       # BB#0:
3500; X32-NEXT:    psubusw %xmm1, %xmm0
3501; X32-NEXT:    retl
3502;
3503; X64-LABEL: test_mm_subs_epu16:
3504; X64:       # BB#0:
3505; X64-NEXT:    psubusw %xmm1, %xmm0
3506; X64-NEXT:    retq
3507  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
3508  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
3509  %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %arg0, <8 x i16> %arg1)
3510  %bc = bitcast <8 x i16> %res to <2 x i64>
3511  ret <2 x i64> %bc
3512}
3513declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
3514
3515define i32 @test_mm_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
3516; X32-LABEL: test_mm_ucomieq_sd:
3517; X32:       # BB#0:
3518; X32-NEXT:    ucomisd %xmm1, %xmm0
3519; X32-NEXT:    setnp %al
3520; X32-NEXT:    sete %cl
3521; X32-NEXT:    andb %al, %cl
3522; X32-NEXT:    movzbl %cl, %eax
3523; X32-NEXT:    retl
3524;
3525; X64-LABEL: test_mm_ucomieq_sd:
3526; X64:       # BB#0:
3527; X64-NEXT:    ucomisd %xmm1, %xmm0
3528; X64-NEXT:    setnp %al
3529; X64-NEXT:    sete %cl
3530; X64-NEXT:    andb %al, %cl
3531; X64-NEXT:    movzbl %cl, %eax
3532; X64-NEXT:    retq
3533  %res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
3534  ret i32 %res
3535}
3536declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
3537
3538define i32 @test_mm_ucomige_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
3539; X32-LABEL: test_mm_ucomige_sd:
3540; X32:       # BB#0:
3541; X32-NEXT:    xorl %eax, %eax
3542; X32-NEXT:    ucomisd %xmm1, %xmm0
3543; X32-NEXT:    setae %al
3544; X32-NEXT:    retl
3545;
3546; X64-LABEL: test_mm_ucomige_sd:
3547; X64:       # BB#0:
3548; X64-NEXT:    xorl %eax, %eax
3549; X64-NEXT:    ucomisd %xmm1, %xmm0
3550; X64-NEXT:    setae %al
3551; X64-NEXT:    retq
3552  %res = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1)
3553  ret i32 %res
3554}
3555declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readnone
3556
3557define i32 @test_mm_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
3558; X32-LABEL: test_mm_ucomigt_sd:
3559; X32:       # BB#0:
3560; X32-NEXT:    xorl %eax, %eax
3561; X32-NEXT:    ucomisd %xmm1, %xmm0
3562; X32-NEXT:    seta %al
3563; X32-NEXT:    retl
3564;
3565; X64-LABEL: test_mm_ucomigt_sd:
3566; X64:       # BB#0:
3567; X64-NEXT:    xorl %eax, %eax
3568; X64-NEXT:    ucomisd %xmm1, %xmm0
3569; X64-NEXT:    seta %al
3570; X64-NEXT:    retq
3571  %res = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %a0, <2 x double> %a1)
3572  ret i32 %res
3573}
3574declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readnone
3575
3576define i32 @test_mm_ucomile_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
3577; X32-LABEL: test_mm_ucomile_sd:
3578; X32:       # BB#0:
3579; X32-NEXT:    xorl %eax, %eax
3580; X32-NEXT:    ucomisd %xmm0, %xmm1
3581; X32-NEXT:    setae %al
3582; X32-NEXT:    retl
3583;
3584; X64-LABEL: test_mm_ucomile_sd:
3585; X64:       # BB#0:
3586; X64-NEXT:    xorl %eax, %eax
3587; X64-NEXT:    ucomisd %xmm0, %xmm1
3588; X64-NEXT:    setae %al
3589; X64-NEXT:    retq
3590  %res = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1)
3591  ret i32 %res
3592}
3593declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readnone
3594
3595define i32 @test_mm_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
3596; X32-LABEL: test_mm_ucomilt_sd:
3597; X32:       # BB#0:
3598; X32-NEXT:    xorl %eax, %eax
3599; X32-NEXT:    ucomisd %xmm0, %xmm1
3600; X32-NEXT:    seta %al
3601; X32-NEXT:    retl
3602;
3603; X64-LABEL: test_mm_ucomilt_sd:
3604; X64:       # BB#0:
3605; X64-NEXT:    xorl %eax, %eax
3606; X64-NEXT:    ucomisd %xmm0, %xmm1
3607; X64-NEXT:    seta %al
3608; X64-NEXT:    retq
3609  %res = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %a0, <2 x double> %a1)
3610  ret i32 %res
3611}
3612declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readnone
3613
3614define i32 @test_mm_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
3615; X32-LABEL: test_mm_ucomineq_sd:
3616; X32:       # BB#0:
3617; X32-NEXT:    ucomisd %xmm1, %xmm0
3618; X32-NEXT:    setp %al
3619; X32-NEXT:    setne %cl
3620; X32-NEXT:    orb %al, %cl
3621; X32-NEXT:    movzbl %cl, %eax
3622; X32-NEXT:    retl
3623;
3624; X64-LABEL: test_mm_ucomineq_sd:
3625; X64:       # BB#0:
3626; X64-NEXT:    ucomisd %xmm1, %xmm0
3627; X64-NEXT:    setp %al
3628; X64-NEXT:    setne %cl
3629; X64-NEXT:    orb %al, %cl
3630; X64-NEXT:    movzbl %cl, %eax
3631; X64-NEXT:    retq
3632  %res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1)
3633  ret i32 %res
3634}
3635declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
3636
3637define <2 x double> @test_mm_undefined_pd() {
3638; X32-LABEL: test_mm_undefined_pd:
3639; X32:       # BB#0:
3640; X32-NEXT:    retl
3641;
3642; X64-LABEL: test_mm_undefined_pd:
3643; X64:       # BB#0:
3644; X64-NEXT:    retq
3645  ret <2 x double> undef
3646}
3647
3648define <2 x i64> @test_mm_undefined_si128() {
3649; X32-LABEL: test_mm_undefined_si128:
3650; X32:       # BB#0:
3651; X32-NEXT:    retl
3652;
3653; X64-LABEL: test_mm_undefined_si128:
3654; X64:       # BB#0:
3655; X64-NEXT:    retq
3656  ret <2 x i64> undef
3657}
3658
3659define <2 x i64> @test_mm_unpackhi_epi8(<2 x i64> %a0, <2 x i64> %a1) {
3660; X32-LABEL: test_mm_unpackhi_epi8:
3661; X32:       # BB#0:
3662; X32-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
3663; X32-NEXT:    retl
3664;
3665; X64-LABEL: test_mm_unpackhi_epi8:
3666; X64:       # BB#0:
3667; X64-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
3668; X64-NEXT:    retq
3669  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
3670  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
3671  %res = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
3672  %bc = bitcast <16 x i8> %res to <2 x i64>
3673  ret <2 x i64> %bc
3674}
3675
3676define <2 x i64> @test_mm_unpackhi_epi16(<2 x i64> %a0, <2 x i64> %a1) {
3677; X32-LABEL: test_mm_unpackhi_epi16:
3678; X32:       # BB#0:
3679; X32-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3680; X32-NEXT:    retl
3681;
3682; X64-LABEL: test_mm_unpackhi_epi16:
3683; X64:       # BB#0:
3684; X64-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3685; X64-NEXT:    retq
3686  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
3687  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
3688  %res = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
3689  %bc = bitcast <8 x i16> %res to <2 x i64>
3690  ret <2 x i64> %bc
3691}
3692
3693define <2 x i64> @test_mm_unpackhi_epi32(<2 x i64> %a0, <2 x i64> %a1) {
3694; X32-LABEL: test_mm_unpackhi_epi32:
3695; X32:       # BB#0:
3696; X32-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3697; X32-NEXT:    retl
3698;
3699; X64-LABEL: test_mm_unpackhi_epi32:
3700; X64:       # BB#0:
3701; X64-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3702; X64-NEXT:    retq
3703  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
3704  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
3705  %res = shufflevector <4 x i32> %arg0,<4 x i32> %arg1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
3706  %bc = bitcast <4 x i32> %res to <2 x i64>
3707  ret <2 x i64> %bc
3708}
3709
3710define <2 x i64> @test_mm_unpackhi_epi64(<2 x i64> %a0, <2 x i64> %a1) {
3711; X32-LABEL: test_mm_unpackhi_epi64:
3712; X32:       # BB#0:
3713; X32-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3714; X32-NEXT:    retl
3715;
3716; X64-LABEL: test_mm_unpackhi_epi64:
3717; X64:       # BB#0:
3718; X64-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3719; X64-NEXT:    retq
3720  %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
3721  ret <2 x i64> %res
3722}
3723
3724define <2 x double> @test_mm_unpackhi_pd(<2 x double> %a0, <2 x double> %a1) {
3725; X32-LABEL: test_mm_unpackhi_pd:
3726; X32:       # BB#0:
3727; X32-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3728; X32-NEXT:    retl
3729;
3730; X64-LABEL: test_mm_unpackhi_pd:
3731; X64:       # BB#0:
3732; X64-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3733; X64-NEXT:    retq
3734  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
3735  ret <2 x double> %res
3736}
3737
3738define <2 x i64> @test_mm_unpacklo_epi8(<2 x i64> %a0, <2 x i64> %a1) {
3739; X32-LABEL: test_mm_unpacklo_epi8:
3740; X32:       # BB#0:
3741; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3742; X32-NEXT:    retl
3743;
3744; X64-LABEL: test_mm_unpacklo_epi8:
3745; X64:       # BB#0:
3746; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3747; X64-NEXT:    retq
3748  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
3749  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
3750  %res = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
3751  %bc = bitcast <16 x i8> %res to <2 x i64>
3752  ret <2 x i64> %bc
3753}
3754
3755define <2 x i64> @test_mm_unpacklo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
3756; X32-LABEL: test_mm_unpacklo_epi16:
3757; X32:       # BB#0:
3758; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3759; X32-NEXT:    retl
3760;
3761; X64-LABEL: test_mm_unpacklo_epi16:
3762; X64:       # BB#0:
3763; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3764; X64-NEXT:    retq
3765  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
3766  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
3767  %res = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
3768  %bc = bitcast <8 x i16> %res to <2 x i64>
3769  ret <2 x i64> %bc
3770}
3771
3772define <2 x i64> @test_mm_unpacklo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
3773; X32-LABEL: test_mm_unpacklo_epi32:
3774; X32:       # BB#0:
3775; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3776; X32-NEXT:    retl
3777;
3778; X64-LABEL: test_mm_unpacklo_epi32:
3779; X64:       # BB#0:
3780; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3781; X64-NEXT:    retq
3782  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
3783  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
3784  %res = shufflevector <4 x i32> %arg0,<4 x i32> %arg1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
3785  %bc = bitcast <4 x i32> %res to <2 x i64>
3786  ret <2 x i64> %bc
3787}
3788
3789define <2 x i64> @test_mm_unpacklo_epi64(<2 x i64> %a0, <2 x i64> %a1) {
3790; X32-LABEL: test_mm_unpacklo_epi64:
3791; X32:       # BB#0:
3792; X32-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3793; X32-NEXT:    retl
3794;
3795; X64-LABEL: test_mm_unpacklo_epi64:
3796; X64:       # BB#0:
3797; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3798; X64-NEXT:    retq
3799  %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
3800  ret <2 x i64> %res
3801}
3802
3803define <2 x double> @test_mm_unpacklo_pd(<2 x double> %a0, <2 x double> %a1) {
3804; X32-LABEL: test_mm_unpacklo_pd:
3805; X32:       # BB#0:
3806; X32-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3807; X32-NEXT:    retl
3808;
3809; X64-LABEL: test_mm_unpacklo_pd:
3810; X64:       # BB#0:
3811; X64-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3812; X64-NEXT:    retq
3813  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
3814  ret <2 x double> %res
3815}
3816
3817define <2 x double> @test_mm_xor_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
3818; X32-LABEL: test_mm_xor_pd:
3819; X32:       # BB#0:
3820; X32-NEXT:    xorps %xmm1, %xmm0
3821; X32-NEXT:    retl
3822;
3823; X64-LABEL: test_mm_xor_pd:
3824; X64:       # BB#0:
3825; X64-NEXT:    xorps %xmm1, %xmm0
3826; X64-NEXT:    retq
3827  %arg0 = bitcast <2 x double> %a0 to <4 x i32>
3828  %arg1 = bitcast <2 x double> %a1 to <4 x i32>
3829  %res = xor <4 x i32> %arg0, %arg1
3830  %bc = bitcast <4 x i32> %res to <2 x double>
3831  ret <2 x double> %bc
3832}
3833
3834define <2 x i64> @test_mm_xor_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
3835; X32-LABEL: test_mm_xor_si128:
3836; X32:       # BB#0:
3837; X32-NEXT:    xorps %xmm1, %xmm0
3838; X32-NEXT:    retl
3839;
3840; X64-LABEL: test_mm_xor_si128:
3841; X64:       # BB#0:
3842; X64-NEXT:    xorps %xmm1, %xmm0
3843; X64-NEXT:    retq
3844  %res = xor <2 x i64> %a0, %a1
3845  ret <2 x i64> %res
3846}
3847
3848!0 = !{i32 1}
3849
3850