; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSSE3
; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2

; Verify that the backend recognizes the canonical unsigned-saturating-subtract
; IR patterns (icmp + xor/add/sub + select) and lowers them to a single
; PSUBUSB/PSUBUSW (or the VEX-encoded VPSUBUSB/VPSUBUSW) instruction:
;   test1-test3:  <8 x i16>  -> psubusw  (constant, constant, splatted scalar)
;   test4-test6:  <16 x i8>  -> psubusb  (constant, constant, splatted scalar)
;   test7-test9:  <16 x i16> -> 256-bit vpsubusw (AVX2 only)
;   test10-test12: <32 x i8> -> 256-bit vpsubusb (AVX2 only)

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

; usat-sub of the splat constant 32768: x & 0x8000 ? x ^ 0x8000 : 0.
define void @test1(i16* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp slt <8 x i16> %2, zeroinitializer
  %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void

; SSSE3: @test1
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: psubusw LCPI0_0(%rip), %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: retq

; AVX1: @test1
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq

; AVX2: @test1
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
}

; usat-sub of the splat constant 32767: x > 32766 ? x - 32767 : 0.
define void @test2(i16* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void

; SSSE3: @test2
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: psubusw LCPI1_0(%rip), %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: retq

; AVX1: @test2
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq

; AVX2: @test2
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
}

; usat-sub of a variable splat: x < w ? 0 : x - w.
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <8 x i16>*
  %3 = load <8 x i16>, <8 x i16>* %2, align 2
  %4 = icmp ult <8 x i16> %3, %broadcast15
  %5 = sub <8 x i16> %3, %broadcast15
  %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
  store <8 x i16> %6, <8 x i16>* %2, align 2
  ret void

; SSSE3: @test3
; SSSE3: # BB#0:
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: psubusw %xmm0, %xmm1
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq

; AVX1: @test3
; AVX1: # BB#0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq

; AVX2: @test3
; AVX2: # BB#0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdi), %xmm1
; AVX2-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
}

; i8 variant of test1: usat-sub of the splat constant 128.
define void @test4(i8* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp slt <16 x i8> %2, zeroinitializer
  %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void

; SSSE3: @test4
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: psubusb LCPI3_0(%rip), %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: retq

; AVX1: @test4
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq

; AVX2: @test4
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
}

; i8 variant of test2: usat-sub of the splat constant 127.
define void @test5(i8* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void

; SSSE3: @test5
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: psubusb LCPI4_0(%rip), %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: retq

; AVX1: @test5
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq

; AVX2: @test5
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
}

; i8 variant of test3: usat-sub of a variable splat.
define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <16 x i8>*
  %3 = load <16 x i8>, <16 x i8>* %2, align 1
  %4 = icmp ult <16 x i8> %3, %broadcast15
  %5 = sub <16 x i8> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
  store <16 x i8> %6, <16 x i8>* %2, align 1
  ret void

; SSSE3: @test6
; SSSE3: # BB#0:
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: psubusb %xmm0, %xmm1
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq

; AVX1: @test6
; AVX1: # BB#0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq

; AVX2: @test6
; AVX2: # BB#0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdi), %xmm1
; AVX2-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
}

; 256-bit variant of test1; only AVX2 has a 256-bit integer vpsubusw.
define void @test7(i16* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp slt <16 x i16> %2, zeroinitializer
  %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void

; AVX2: @test7
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}

; 256-bit variant of test2 (AVX2 only).
define void @test8(i16* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void

; AVX2: @test8
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}

; 256-bit variant of test3 (AVX2 only).
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <16 x i16>*
  %3 = load <16 x i16>, <16 x i16>* %2, align 2
  %4 = icmp ult <16 x i16> %3, %broadcast15
  %5 = sub <16 x i16> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
  store <16 x i16> %6, <16 x i16>* %2, align 2
  ret void

; AVX2: @test9
; AVX2: # BB#0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqu (%rdi), %ymm1
; AVX2-NEXT: vpsubusw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}

; 256-bit variant of test4 (AVX2 only).
define void @test10(i8* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp slt <32 x i8> %2, zeroinitializer
  %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void

; AVX2: @test10
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}

; 256-bit variant of test5 (AVX2 only).
define void @test11(i8* nocapture %head) nounwind {
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void

; AVX2: @test11
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}

; 256-bit variant of test6 (AVX2 only).
define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <32 x i8>*
  %3 = load <32 x i8>, <32 x i8>* %2, align 1
  %4 = icmp ult <32 x i8> %3, %broadcast15
  %5 = sub <32 x i8> %3, %broadcast15
  %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
  store <32 x i8> %6, <32 x i8>* %2, align 1
  ret void

; AVX2: @test12
; AVX2: # BB#0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu (%rdi), %ymm1
; AVX2-NEXT: vpsubusb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}