1; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSSE3
2; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
3; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
4
5target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
6target triple = "x86_64-apple-macosx10.8.0"
7
; test1: in-place transform of 8 x i16 at %head:
;   x = (x <s 0) ? (x ^ 0x8000) : 0
; When the sign bit is set, x ^ 0x8000 == x - 0x8000 (unsigned), so this is
; exactly usubsat(x, 0x8000). The CHECK lines expect the backend to fold the
; compare/xor/select into a single (v)psubusw against a constant-pool splat,
; with unaligned (align 2) load/store of the vector.
define void @test1(i16* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp slt <8 x i16> %2, zeroinitializer
  %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void

; SSSE3: @test1
; SSSE3:      # BB#0:
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: psubusw LCPI0_0(%rip), %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: retq

; AVX1: @test1
; AVX1:      # BB#0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq

; AVX2: @test1
; AVX2:      # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
}
40
; test2: in-place transform of 8 x i16 at %head:
;   x = (x >u 32766) ? (x + (-32767)) : 0
; i.e. x >= 32767 ? x - 32767 : 0, which is usubsat(x, 32767). The CHECKs
; expect a single (v)psubusw from the constant pool, as in test1.
define void @test2(i16* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void

; SSSE3: @test2
; SSSE3:      # BB#0:
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: psubusw LCPI1_0(%rip), %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: retq

; AVX1: @test2
; AVX1:      # BB#0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq

; AVX2: @test2
; AVX2:      # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
}
73
; test3: in-place transform of 8 x i16 at %head with a variable threshold %w
; splat-broadcast across all lanes:
;   x = (x <u w) ? 0 : (x - w)
; which is usubsat(x, w). The CHECKs expect the scalar %w to be moved into
; an xmm register and broadcast (pshufb on SSSE3/AVX1, vpbroadcastw on AVX2),
; followed by a single (v)psubusw.
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <8 x i16>*
  %3 = load <8 x i16>, <8 x i16>* %2, align 2
  %4 = icmp ult <8 x i16> %3, %broadcast15
  %5 = sub <8 x i16> %3, %broadcast15
  %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
  store <8 x i16> %6, <8 x i16>* %2, align 2
  ret void

; SSSE3: @test3
; SSSE3:      # BB#0:
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: psubusw %xmm0, %xmm1
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq

; AVX1: @test3
; AVX1:      # BB#0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq

; AVX2: @test3
; AVX2:      # BB#0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdi), %xmm1
; AVX2-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
}
114
; test4: byte variant of test1 — in-place transform of 16 x i8 at %head:
;   x = (x <s 0) ? (x ^ 0x80) : 0
; For sign-bit-set x, x ^ 0x80 == x - 0x80 (unsigned), i.e. usubsat(x, 0x80).
; The CHECKs expect a single (v)psubusb against a constant-pool splat.
define void @test4(i8* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp slt <16 x i8> %2, zeroinitializer
  %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void

; SSSE3: @test4
; SSSE3:      # BB#0:
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: psubusb LCPI3_0(%rip), %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: retq

; AVX1: @test4
; AVX1:      # BB#0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq

; AVX2: @test4
; AVX2:      # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
}
147
; test5: byte variant of test2 — in-place transform of 16 x i8 at %head:
;   x = (x >u 126) ? (x + (-127)) : 0
; i.e. x >= 127 ? x - 127 : 0, which is usubsat(x, 127). The CHECKs expect a
; single (v)psubusb from the constant pool.
; NOTE(review): the AVX CHECK lines below previously used the two-operand
; form "vpsubusb LCPI4_0(%rip), %xmm0", which is not valid AVX syntax (AVX
; is three-operand) and only matched via FileCheck substring matching; they
; are tightened here to the full output, consistent with test4.
define void @test5(i8* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void

; SSSE3: @test5
; SSSE3:      # BB#0:
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: psubusb LCPI4_0(%rip), %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: retq

; AVX1: @test5
; AVX1:      # BB#0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq

; AVX2: @test5
; AVX2:      # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
}
180
; test6: byte variant of test3 — in-place transform of 16 x i8 at %head with
; a variable threshold %w splat-broadcast across all lanes:
;   x = (x <u w) ? 0 : (x - w)
; which is usubsat(x, w). The CHECKs expect the scalar %w broadcast (pxor +
; pshufb on SSSE3/AVX1, vpbroadcastb on AVX2), then a single (v)psubusb.
; NOTE(review): the AVX1 vpxor/vpshufb CHECK lines previously used invalid
; two-operand AVX forms ("vpxor %xmm1, %xmm1", "vpshufb %xmm1, %xmm0") that
; only matched as substrings of the real three-operand output; they are
; tightened here, consistent with the other AVX checks in this file.
define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <16 x i8>*
  %3 = load <16 x i8>, <16 x i8>* %2, align 1
  %4 = icmp ult <16 x i8> %3, %broadcast15
  %5 = sub <16 x i8> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
  store <16 x i8> %6, <16 x i8>* %2, align 1
  ret void

; SSSE3: @test6
; SSSE3:      # BB#0:
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: psubusb %xmm0, %xmm1
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq

; AVX1: @test6
; AVX1:      # BB#0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq

; AVX2: @test6
; AVX2:      # BB#0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdi), %xmm1
; AVX2-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
}
223
; test7: 256-bit variant of test1 — 16 x i16, usubsat(x, 0x8000) expressed as
;   x = (x <s 0) ? (x ^ 0x8000) : 0
; Only checked for AVX2 (the only target here with 256-bit integer ops);
; expects a single ymm vpsubusw and a trailing vzeroupper before ret.
define void @test7(i16* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp slt <16 x i16> %2, zeroinitializer
  %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void

; AVX2: @test7
; AVX2:      # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}
243
; test8: 256-bit variant of test2 — 16 x i16, usubsat(x, 32767) expressed as
;   x = (x >u 32766) ? (x + (-32767)) : 0
; AVX2-only checks: single ymm vpsubusw plus vzeroupper.
define void @test8(i16* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void

; AVX2: @test8
; AVX2:      # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}
263
; test9: 256-bit variant of test3 — 16 x i16, usubsat(x, w) with a splat
; threshold:
;   x = (x <u w) ? 0 : (x - w)
; AVX2-only checks: vpbroadcastw of %w to ymm, then vpsubusw + vzeroupper.
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <16 x i16>*
  %3 = load <16 x i16>, <16 x i16>* %2, align 2
  %4 = icmp ult <16 x i16> %3, %broadcast15
  %5 = sub <16 x i16> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
  store <16 x i16> %6, <16 x i16>* %2, align 2
  ret void

; AVX2: @test9
; AVX2:      # BB#0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqu (%rdi), %ymm1
; AVX2-NEXT: vpsubusw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}
287
; test10: 256-bit variant of test4 — 32 x i8, usubsat(x, 0x80) expressed as
;   x = (x <s 0) ? (x ^ 0x80) : 0
; AVX2-only checks: single ymm vpsubusb plus vzeroupper.
define void @test10(i8* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp slt <32 x i8> %2, zeroinitializer
  %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void

; AVX2: @test10
; AVX2:      # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}
307
; test11: 256-bit variant of test5 — 32 x i8, usubsat(x, 127) expressed as
;   x = (x >u 126) ? (x + (-127)) : 0
; AVX2-only checks: single ymm vpsubusb plus vzeroupper.
define void @test11(i8* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void

; AVX2: @test11
; AVX2:      # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}
327
; test12: 256-bit variant of test6 — 32 x i8, usubsat(x, w) with a splat
; threshold:
;   x = (x <u w) ? 0 : (x - w)
; AVX2-only checks: vpbroadcastb of %w to ymm, then vpsubusb + vzeroupper.
define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <32 x i8>*
  %3 = load <32 x i8>, <32 x i8>* %2, align 1
  %4 = icmp ult <32 x i8> %3, %broadcast15
  %5 = sub <32 x i8> %3, %broadcast15
  %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
  store <32 x i8> %6, <32 x i8>* %2, align 1
  ret void

; AVX2: @test12
; AVX2:      # BB#0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu (%rdi), %ymm1
; AVX2-NEXT: vpsubusb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}
351