; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

define void @test1(i16* nocapture %head) nounwind {
; SSE-LABEL: test1:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test1:
; AVX: ## BB#0: ## %vector.ph
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp slt <8 x i16> %2, zeroinitializer
  %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void
}

define void @test2(i16* nocapture %head) nounwind {
; SSE-LABEL: test2:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: ## BB#0: ## %vector.ph
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void
}
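
; Variable subtrahend: the splat of %w is both compared and subtracted, so
; (x <u w) ? 0 : x - w should still be matched as an unsigned saturating
; subtract of the broadcast value.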
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
; SSE2-LABEL: test3:
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: psubusw %xmm0, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test3:
; SSSE3: ## BB#0: ## %vector.ph
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: psubusw %xmm0, %xmm1
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdi), %xmm1
; AVX2-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
vector.ph:
  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <8 x i16>*
  %3 = load <8 x i16>, <8 x i16>* %2, align 2
  %4 = icmp ult <8 x i16> %3, %broadcast15
  %5 = sub <8 x i16> %3, %broadcast15
  %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
  store <8 x i16> %6, <8 x i16>* %2, align 2
  ret void
}

define void @test4(i8* nocapture %head) nounwind {
; SSE-LABEL: test4:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test4:
; AVX: ## BB#0: ## %vector.ph
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp slt <16 x i8> %2, zeroinitializer
  %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void
}

define void @test5(i8* nocapture %head) nounwind {
; SSE-LABEL: test5:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test5:
; AVX: ## BB#0: ## %vector.ph
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void
}
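
; v16i8 version of test3: the splatted variable subtrahend should select
; PSUBUSB.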
define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2-LABEL: test6:
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: psubusb %xmm0, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test6:
; SSSE3: ## BB#0: ## %vector.ph
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: psubusb %xmm0, %xmm1
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test6:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdi), %xmm1
; AVX2-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
vector.ph:
  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <16 x i8>*
  %3 = load <16 x i8>, <16 x i8>* %2, align 1
  %4 = icmp ult <16 x i8> %3, %broadcast15
  %5 = sub <16 x i8> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
  store <16 x i8> %6, <16 x i8>* %2, align 1
  ret void
}

define void @test7(i16* nocapture %head) nounwind {
; SSE-LABEL: test7:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: movdqu %xmm1, 16(%rdi)
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test7:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test7:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp slt <16 x i16> %2, zeroinitializer
  %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void
}
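
; v16i16 version of test2: (x >u 32766) ? x + (-32767) : 0. AVX2 handles
; this with a single 256-bit VPSUBUSW; AVX1 has to split the vector.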
define void @test8(i16* nocapture %head) nounwind {
; SSE-LABEL: test8:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: movdqu %xmm1, 16(%rdi)
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test8:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
; AVX1-NEXT: vpcmpgtw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769]
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void
}
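
; v16i16 version of test3 with a splatted variable subtrahend.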
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
; SSE2-LABEL: test9:
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: movdqu 16(%rdi), %xmm2
; SSE2-NEXT: psubusw %xmm0, %xmm1
; SSE2-NEXT: psubusw %xmm0, %xmm2
; SSE2-NEXT: movdqu %xmm2, 16(%rdi)
; SSE2-NEXT: movdqu %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test9:
; SSSE3: ## BB#0: ## %vector.ph
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: movdqu 16(%rdi), %xmm2
; SSSE3-NEXT: psubusw %xmm0, %xmm1
; SSSE3-NEXT: psubusw %xmm0, %xmm2
; SSSE3-NEXT: movdqu %xmm2, 16(%rdi)
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test9:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovd %esi, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vpmaxuw %xmm1, %xmm2, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test9:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqu (%rdi), %ymm1
; AVX2-NEXT: vpsubusw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <16 x i16>*
  %3 = load <16 x i16>, <16 x i16>* %2, align 2
  %4 = icmp ult <16 x i16> %3, %broadcast15
  %5 = sub <16 x i16> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
  store <16 x i16> %6, <16 x i16>* %2, align 2
  ret void
}

define void @test10(i8* nocapture %head) nounwind {
; SSE-LABEL: test10:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE-NEXT: psubusb %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm2, %xmm1
; SSE-NEXT: movdqu %xmm1, 16(%rdi)
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test10:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp slt <32 x i8> %2, zeroinitializer
  %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void
}
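
; v32i8 version of test5: (x >u 126) ? x + (-127) : 0.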
define void @test11(i8* nocapture %head) nounwind {
; SSE-LABEL: test11:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT: psubusb %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm2, %xmm1
; SSE-NEXT: movdqu %xmm1, 16(%rdi)
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test11:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
; AVX1-NEXT: vpcmpgtb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void
}
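
; v32i8 version of test6 with a splatted variable subtrahend.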
define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2-LABEL: test12:
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: movdqu 16(%rdi), %xmm2
; SSE2-NEXT: psubusb %xmm0, %xmm1
; SSE2-NEXT: psubusb %xmm0, %xmm2
; SSE2-NEXT: movdqu %xmm2, 16(%rdi)
; SSE2-NEXT: movdqu %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test12:
; SSSE3: ## BB#0: ## %vector.ph
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: movdqu 16(%rdi), %xmm2
; SSSE3-NEXT: psubusb %xmm0, %xmm1
; SSSE3-NEXT: psubusb %xmm0, %xmm2
; SSSE3-NEXT: movdqu %xmm2, 16(%rdi)
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test12:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovd %esi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vpmaxub %xmm1, %xmm2, %xmm4
; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu (%rdi), %ymm1
; AVX2-NEXT: vpsubusb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <32 x i8>*
  %3 = load <32 x i8>, <32 x i8>* %2, align 1
  %4 = icmp ult <32 x i8> %3, %broadcast15
  %5 = sub <32 x i8> %3, %broadcast15
  %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
  store <32 x i8> %6, <32 x i8>* %2, align 1
  ret void
}