; Codegen test for select-based unsigned saturating-subtract idioms on 128-bit
; and 256-bit vectors of i16 and i8, compiled at four x86 feature levels.
; Check prefixes used in this file: SSE is shared by the sse2 and ssse3
; invocations, AVX is shared by the avx and avx2 invocations, and SSE2, SSSE3,
; AVX1 and AVX2 each belong to exactly one invocation.
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

; test1 -- <8 x i16>, constant subtrahend 32768.
; "icmp slt x, 0" is true exactly for lanes whose top bit is set, i.e. lanes
; that are >= 32768 when read as unsigned. For those lanes "xor x, -32768"
; clears the top bit, which is the same value as x - 32768. All other lanes are
; selected to zero, so the function stores an unsigned saturating subtract of
; 32768 back over the loaded vector. The checks expect a single (v)psubusw with
; a RIP-relative constant operand at every feature level.
define void @test1(i16* nocapture %head) nounwind {
; SSE-LABEL: test1:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       ## BB#0: ## %vector.ph
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  ; align 2 on a 16-byte vector: the checks expect unaligned moves (movdqu).
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp slt <8 x i16> %2, zeroinitializer
  %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void
}

; test2 -- <8 x i16>, constant subtrahend 32767.
; "icmp ugt x, 32766" selects the lanes with x >= 32767 (unsigned). For those
; lanes "add x, -32767" is x - 32767 in 16-bit wrap-around arithmetic; every
; other lane becomes zero. Together this is an unsigned saturating subtract of
; 32767, and the checks expect a single (v)psubusw with a RIP-relative constant
; operand at every feature level.
define void @test2(i16* nocapture %head) nounwind {
; SSE-LABEL: test2:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       ## BB#0: ## %vector.ph
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  ; Compare constant (32766) is one below the subtrahend (32767), so "ugt"
  ; here means "unsigned >= subtrahend".
  %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void
}

; test3 -- <8 x i16>, variable subtrahend %w splatted to every lane.
; The select yields zero where x is unsigned-less-than the splat and x - splat
; elsewhere, i.e. an unsigned saturating subtract of %w. All feature levels
; are expected to use a register-register (v)psubusw; they differ only in how
; the splat is built: pshuflw + pshufd for the SSE and AVX1 checks, a single
; vpbroadcastw for the AVX2 checks.
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
; SSE-LABEL: test3:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movd %esi, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    movdqu (%rdi), %xmm1
; SSE-NEXT:    psubusw %xmm0, %xmm1
; SSE-NEXT:    movdqu %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test3:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT:    vmovdqu (%rdi), %xmm1
; AVX1-NEXT:    vpsubusw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu (%rdi), %xmm1
; AVX2-NEXT:    vpsubusw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX2-NEXT:    retq
vector.ph:
  ; insertelement into lane 0 followed by an all-zero shuffle mask = splat of %w.
  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <8 x i16>*
  %3 = load <8 x i16>, <8 x i16>* %2, align 2
  %4 = icmp ult <8 x i16> %3, %broadcast15
  %5 = sub <8 x i16> %3, %broadcast15
  ; Note the operand order: zero is the "true" arm here, unlike test1/test2.
  %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
  store <8 x i16> %6, <8 x i16>* %2, align 2
  ret void
}

; test4 -- <16 x i8>, constant subtrahend 128.
; Byte-lane counterpart of the sign-bit idiom: "icmp slt x, 0" is true for
; lanes that are >= 128 as unsigned, and "xor x, -128" clears the top bit of
; those lanes, which equals x - 128. Other lanes are selected to zero. The
; checks expect a single (v)psubusb with a RIP-relative constant operand at
; every feature level.
define void @test4(i8* nocapture %head) nounwind {
; SSE-LABEL: test4:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       ## BB#0: ## %vector.ph
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  ; align 1 on a 16-byte vector: the checks expect unaligned moves (movdqu).
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp slt <16 x i8> %2, zeroinitializer
  %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void
}

; test5 -- <16 x i8>, constant subtrahend 127.
; "icmp ugt x, 126" selects lanes with x >= 127 (unsigned); for those lanes
; "add x, -127" is x - 127 in 8-bit wrap-around arithmetic, and every other
; lane becomes zero. This is an unsigned saturating subtract of 127; the checks
; expect a single (v)psubusb with a RIP-relative constant operand at every
; feature level.
define void @test5(i8* nocapture %head) nounwind {
; SSE-LABEL: test5:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       ## BB#0: ## %vector.ph
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  ; Compare constant (126) is one below the subtrahend (127), so "ugt" here
  ; means "unsigned >= subtrahend".
  %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void
}

; test6 -- <16 x i8>, variable subtrahend %w splatted to every lane.
; The select yields zero where x is unsigned-less-than the splat and x - splat
; elsewhere, i.e. an unsigned saturating subtract of %w. Every feature level is
; expected to use a register-register (v)psubusb. The byte splat is what
; differs, so SSE2 and SSSE3 get separate check blocks: punpcklbw + pshuflw +
; pshufd for SSE2, pshufb with an all-zero mask for SSSE3 and AVX1, and a
; single vpbroadcastb for AVX2.
define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2-LABEL: test6:
; SSE2:       ## BB#0: ## %vector.ph
; SSE2-NEXT:    movd %esi, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT:    movdqu (%rdi), %xmm1
; SSE2-NEXT:    psubusb %xmm0, %xmm1
; SSE2-NEXT:    movdqu %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test6:
; SSSE3:       ## BB#0: ## %vector.ph
; SSSE3-NEXT:    movd %esi, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    movdqu (%rdi), %xmm1
; SSSE3-NEXT:    psubusb %xmm0, %xmm1
; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test6:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu (%rdi), %xmm1
; AVX1-NEXT:    vpsubusb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu (%rdi), %xmm1
; AVX2-NEXT:    vpsubusb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX2-NEXT:    retq
vector.ph:
  ; insertelement into lane 0 followed by an all-zero shuffle mask = splat of %w.
  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <16 x i8>*
  %3 = load <16 x i8>, <16 x i8>* %2, align 1
  %4 = icmp ult <16 x i8> %3, %broadcast15
  %5 = sub <16 x i8> %3, %broadcast15
  ; Zero is the "true" arm: lanes that would underflow are clamped to 0.
  %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
  store <16 x i8> %6, <16 x i8>* %2, align 1
  ret void
}

; test7 -- <16 x i16> (256-bit), constant subtrahend 32768.
; Same sign-bit idiom as the 128-bit i16 case: lanes with the top bit set get
; that bit cleared (x - 32768), all other lanes become zero.
; Expected code per feature level, as recorded by the checks below:
;   - SSE prefixes: the vector is handled as two xmm halves, each with its own
;     psubusw against one shared constant register.
;   - AVX1 prefix: no vpsubusw; the checks record a vpcmpgtw-against-zero mask
;     per half, a ymm vxorps with the constant, and a ymm vandps.
;   - AVX2 prefix: a single ymm vpsubusw with a RIP-relative constant.
define void @test7(i16* nocapture %head) nounwind {
; SSE-LABEL: test7:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT:    psubusw %xmm2, %xmm0
; SSE-NEXT:    psubusw %xmm2, %xmm1
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test7:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp slt <16 x i16> %2, zeroinitializer
  %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void
}

; test8 -- <16 x i16> (256-bit), constant subtrahend 32767.
; "icmp ugt x, 32766" selects lanes with x >= 32767 (unsigned); those lanes get
; x - 32767 via "add x, -32767", all others become zero.
; Expected code per feature level, as recorded by the checks below:
;   - SSE prefixes: two xmm psubusw instructions sharing one constant register.
;   - AVX1 prefix: no vpsubusw; each half is xor-ed with 32768 and compared
;     with a signed vpcmpgtw to form the mask, the subtraction is a vpaddw per
;     half, and the two are combined with a ymm vandps.
;   - AVX2 prefix: a single ymm vpsubusw with a RIP-relative constant.
define void @test8(i16* nocapture %head) nounwind {
; SSE-LABEL: test8:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT:    psubusw %xmm2, %xmm0
; SSE-NEXT:    psubusw %xmm2, %xmm1
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test8:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769]
; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  ; Compare constant (32766) is one below the subtrahend (32767), so "ugt"
  ; here means "unsigned >= subtrahend".
  %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void
}

; test9 -- <16 x i16> (256-bit), variable subtrahend %w splatted to every lane.
; The select yields zero where x is unsigned-less-than the splat and x - splat
; elsewhere, i.e. an unsigned saturating subtract of %w.
; Expected code per feature level, as recorded by the checks below:
;   - SSE prefixes: splat via pshuflw + pshufd, then one psubusw per xmm half.
;   - AVX1 prefix: no vpsubusw; per half a vpsubw computes the difference and
;     vpmaxuw + vpcmpeqw builds the "x >= w" mask, combined by a ymm vandps.
;   - AVX2 prefix: vpbroadcastw into a ymm register and a single ymm vpsubusw.
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
; SSE-LABEL: test9:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movd %esi, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    movdqu (%rdi), %xmm1
; SSE-NEXT:    movdqu 16(%rdi), %xmm2
; SSE-NEXT:    psubusw %xmm0, %xmm1
; SSE-NEXT:    psubusw %xmm0, %xmm2
; SSE-NEXT:    movdqu %xmm2, 16(%rdi)
; SSE-NEXT:    movdqu %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test9:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovd %esi, %xmm2
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX1-NEXT:    vpsubw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    vpmaxuw %xmm2, %xmm1, %xmm4
; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu (%rdi), %ymm1
; AVX2-NEXT:    vpsubusw %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  ; insertelement into lane 0 followed by an all-zero shuffle mask = splat of %w.
  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <16 x i16>*
  %3 = load <16 x i16>, <16 x i16>* %2, align 2
  %4 = icmp ult <16 x i16> %3, %broadcast15
  %5 = sub <16 x i16> %3, %broadcast15
  ; Zero is the "true" arm: lanes that would underflow are clamped to 0.
  %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
  store <16 x i16> %6, <16 x i16>* %2, align 2
  ret void
}

; test10 -- <32 x i8> (256-bit), constant subtrahend 128.
; Byte-lane sign-bit idiom: lanes with the top bit set get that bit cleared
; (x - 128), all other lanes become zero.
; Expected code per feature level, as recorded by the checks below:
;   - SSE prefixes: two xmm psubusb instructions sharing one constant register.
;   - AVX1 prefix: no vpsubusb; the checks record a vpcmpgtb-against-zero mask
;     per half, a ymm vxorps with the constant, and a ymm vandps.
;   - AVX2 prefix: a single ymm vpsubusb with a RIP-relative constant.
define void @test10(i8* nocapture %head) nounwind {
; SSE-LABEL: test10:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE-NEXT:    psubusb %xmm2, %xmm0
; SSE-NEXT:    psubusb %xmm2, %xmm1
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test10:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test10:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp slt <32 x i8> %2, zeroinitializer
  %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void
}

; test11 -- <32 x i8> (256-bit), constant subtrahend 127.
; "icmp ugt x, 126" selects lanes with x >= 127 (unsigned); those lanes get
; x - 127 via "add x, -127", all others become zero.
; Expected code per feature level, as recorded by the checks below:
;   - SSE prefixes: two xmm psubusb instructions sharing one constant register.
;   - AVX1 prefix: no vpsubusb; each half is xor-ed with 128 and compared with
;     a signed vpcmpgtb to form the mask, the subtraction is a vpaddb per half,
;     and the two are combined with a ymm vandps.
;   - AVX2 prefix: a single ymm vpsubusb with a RIP-relative constant.
define void @test11(i8* nocapture %head) nounwind {
; SSE-LABEL: test11:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT:    psubusb %xmm2, %xmm0
; SSE-NEXT:    psubusb %xmm2, %xmm1
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  ; Compare constant (126) is one below the subtrahend (127), so "ugt" here
  ; means "unsigned >= subtrahend".
  %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void
}

; test12 -- <32 x i8> (256-bit), variable subtrahend %w splatted to every lane.
; The select yields zero where x is unsigned-less-than the splat and x - splat
; elsewhere, i.e. an unsigned saturating subtract of %w.
; Expected code per feature level, as recorded by the checks below:
;   - SSE2 prefix: splat via punpcklbw + pshuflw + pshufd, then one psubusb per
;     xmm half.
;   - SSSE3 prefix: splat via pshufb with an all-zero mask, then one psubusb
;     per xmm half.
;   - AVX1 prefix: no vpsubusb; per half a vpsubb computes the difference and
;     vpmaxub + vpcmpeqb builds the "x >= w" mask, combined by a ymm vandps.
;   - AVX2 prefix: vpbroadcastb into a ymm register and a single ymm vpsubusb.
define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2-LABEL: test12:
; SSE2:       ## BB#0: ## %vector.ph
; SSE2-NEXT:    movd %esi, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT:    movdqu (%rdi), %xmm1
; SSE2-NEXT:    movdqu 16(%rdi), %xmm2
; SSE2-NEXT:    psubusb %xmm0, %xmm1
; SSE2-NEXT:    psubusb %xmm0, %xmm2
; SSE2-NEXT:    movdqu %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqu %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test12:
; SSSE3:       ## BB#0: ## %vector.ph
; SSSE3-NEXT:    movd %esi, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    movdqu (%rdi), %xmm1
; SSSE3-NEXT:    movdqu 16(%rdi), %xmm2
; SSSE3-NEXT:    psubusb %xmm0, %xmm1
; SSSE3-NEXT:    psubusb %xmm0, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, 16(%rdi)
; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test12:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovd %esi, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    vpmaxub %xmm1, %xmm2, %xmm4
; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test12:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu (%rdi), %ymm1
; AVX2-NEXT:    vpsubusb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  ; insertelement into lane 0 followed by an all-zero shuffle mask = splat of %w.
  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <32 x i8>*
  %3 = load <32 x i8>, <32 x i8>* %2, align 1
  %4 = icmp ult <32 x i8> %3, %broadcast15
  %5 = sub <32 x i8> %3, %broadcast15
  ; Zero is the "true" arm: lanes that would underflow are clamped to 0.
  %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
  store <32 x i8> %6, <32 x i8>* %2, align 1
  ret void
}
