; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s --check-prefix=SSE41
; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX

target triple = "x86_64-unknown-unknown"
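; udiv <4 x i32> by the uniform constant 7 should lower to a pmuludq-based
; multiply-high with the magic constant 613566757 (0x24924925) plus the
; standard subtract/shift/add fixup, not to scalarized division.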
define <4 x i32> @test1(<4 x i32> %a) #0 {
; SSE41-LABEL: test1:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm2, %xmm3
; SSE41-NEXT: pmuludq %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT: psubd %xmm1, %xmm0
; SSE41-NEXT: psrld $1, %xmm0
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: psrld $2, %xmm0
; SSE41-NEXT: retq
;
; SSE-LABEL: test1:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm1, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: psrld $1, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1:
; AVX: # BB#0:
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX-NEXT: retq
  %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %div
}
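; Same udiv-by-7 lowering at 256 bits: AVX2 keeps the whole computation in ymm
; registers, while the SSE targets repeat the xmm sequence for each half.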
define <8 x i32> @test2(<8 x i32> %a) #0 {
; SSE41-LABEL: test2:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pmuludq %xmm2, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
; SSE41-NEXT: psubd %xmm5, %xmm0
; SSE41-NEXT: psrld $1, %xmm0
; SSE41-NEXT: paddd %xmm5, %xmm0
; SSE41-NEXT: psrld $2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm3, %xmm4
; SSE41-NEXT: pmuludq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; SSE41-NEXT: psubd %xmm2, %xmm1
; SSE41-NEXT: psrld $1, %xmm1
; SSE41-NEXT: paddd %xmm2, %xmm1
; SSE41-NEXT: psrld $2, %xmm1
; SSE41-NEXT: retq
;
; SSE-LABEL: test2:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSE-NEXT: psubd %xmm3, %xmm0
; SSE-NEXT: psrld $1, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm0
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: psrld $1, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: psrld $2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: # BB#0:
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpsrld $1, %ymm0, %ymm0
; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpsrld $2, %ymm0, %ymm0
; AVX-NEXT: retq
  %div = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %div
}
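; udiv <8 x i16> by 7: pmulhuw computes the multiply-high with the magic
; constant 9363 (0x2493) directly, so only the subtract/shift/add fixup remains.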
define <8 x i16> @test3(<8 x i16> %a) #0 {
; SSE41-LABEL: test3:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: psubw %xmm1, %xmm0
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: psrlw $2, %xmm0
; SSE41-NEXT: retq
;
; SSE-LABEL: test3:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
; SSE-NEXT: psubw %xmm1, %xmm0
; SSE-NEXT: psrlw $1, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: psrlw $2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test3:
; AVX: # BB#0:
; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX-NEXT: retq
  %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %div
}
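; 256-bit version of test3: a single vpmulhuw on AVX2, the xmm sequence doubled
; on the SSE targets.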
define <16 x i16> @test4(<16 x i16> %a) #0 {
; SSE41-LABEL: test4:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pmulhuw %xmm2, %xmm3
; SSE41-NEXT: psubw %xmm3, %xmm0
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: paddw %xmm3, %xmm0
; SSE41-NEXT: psrlw $2, %xmm0
; SSE41-NEXT: pmulhuw %xmm1, %xmm2
; SSE41-NEXT: psubw %xmm2, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: paddw %xmm2, %xmm1
; SSE41-NEXT: psrlw $2, %xmm1
; SSE41-NEXT: retq
;
; SSE-LABEL: test4:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmulhuw %xmm2, %xmm3
; SSE-NEXT: psubw %xmm3, %xmm0
; SSE-NEXT: psrlw $1, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm0
; SSE-NEXT: psrlw $2, %xmm0
; SSE-NEXT: pmulhuw %xmm1, %xmm2
; SSE-NEXT: psubw %xmm2, %xmm1
; SSE-NEXT: psrlw $1, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: psrlw $2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test4:
; AVX: # BB#0:
; AVX-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
; AVX-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpsrlw $2, %ymm0, %ymm0
; AVX-NEXT: retq
  %div = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %div
}
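; sdiv <8 x i16> by 7: pmulhw with the magic constant 18725 (0x4925), then the
; sign bit (psrlw $15) is added back to round the quotient toward zero.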
define <8 x i16> @test5(<8 x i16> %a) #0 {
; SSE41-LABEL: test5:
; SSE41: # BB#0:
; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $15, %xmm1
; SSE41-NEXT: psraw $1, %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE-LABEL: test5:
; SSE: # BB#0:
; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $15, %xmm1
; SSE-NEXT: psraw $1, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test5:
; AVX: # BB#0:
; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %div
}
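; 256-bit version of test5.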
define <16 x i16> @test6(<16 x i16> %a) #0 {
; SSE41-LABEL: test6:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE41-NEXT: pmulhw %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrlw $15, %xmm3
; SSE41-NEXT: psraw $1, %xmm0
; SSE41-NEXT: paddw %xmm3, %xmm0
; SSE41-NEXT: pmulhw %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $15, %xmm2
; SSE41-NEXT: psraw $1, %xmm1
; SSE41-NEXT: paddw %xmm2, %xmm1
; SSE41-NEXT: retq
;
; SSE-LABEL: test6:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT: pmulhw %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrlw $15, %xmm3
; SSE-NEXT: psraw $1, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm0
; SSE-NEXT: pmulhw %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlw $15, %xmm2
; SSE-NEXT: psraw $1, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test6:
; AVX: # BB#0:
; AVX-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vpsrlw $15, %ymm0, %ymm1
; AVX-NEXT: vpsraw $1, %ymm0, %ymm0
; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %div = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %div
}
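; sdiv <16 x i8> by 7: there is no vector multiply-high for bytes, so the
; division is currently scalarized; each lane is sign-extended, multiplied by
; the magic constant -109, and fixed up individually.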
define <16 x i8> @test7(<16 x i8> %a) #0 {
; SSE41-LABEL: test7:
; SSE41: # BB#0:
; SSE41-NEXT: pextrb $1, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pextrb $0, %xmm0, %ecx
; SSE41-NEXT: movsbl %cl, %ecx
; SSE41-NEXT: imull $-109, %ecx, %edx
; SSE41-NEXT: shrl $8, %edx
; SSE41-NEXT: addb %dl, %cl
; SSE41-NEXT: movb %cl, %dl
; SSE41-NEXT: shrb $7, %dl
; SSE41-NEXT: sarb $2, %cl
; SSE41-NEXT: addb %dl, %cl
; SSE41-NEXT: movzbl %cl, %ecx
; SSE41-NEXT: movd %ecx, %xmm1
; SSE41-NEXT: pinsrb $1, %eax, %xmm1
; SSE41-NEXT: pextrb $2, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $2, %eax, %xmm1
; SSE41-NEXT: pextrb $3, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $3, %eax, %xmm1
; SSE41-NEXT: pextrb $4, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $4, %eax, %xmm1
; SSE41-NEXT: pextrb $5, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $5, %eax, %xmm1
; SSE41-NEXT: pextrb $6, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $6, %eax, %xmm1
; SSE41-NEXT: pextrb $7, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $7, %eax, %xmm1
; SSE41-NEXT: pextrb $8, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $8, %eax, %xmm1
; SSE41-NEXT: pextrb $9, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $9, %eax, %xmm1
; SSE41-NEXT: pextrb $10, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $10, %eax, %xmm1
; SSE41-NEXT: pextrb $11, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $11, %eax, %xmm1
; SSE41-NEXT: pextrb $12, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $12, %eax, %xmm1
; SSE41-NEXT: pextrb $13, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $13, %eax, %xmm1
; SSE41-NEXT: pextrb $14, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $14, %eax, %xmm1
; SSE41-NEXT: pextrb $15, %xmm0, %eax
; SSE41-NEXT: movsbl %al, %eax
; SSE41-NEXT: imull $-109, %eax, %ecx
; SSE41-NEXT: shrl $8, %ecx
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movb %al, %cl
; SSE41-NEXT: shrb $7, %cl
; SSE41-NEXT: sarb $2, %al
; SSE41-NEXT: addb %cl, %al
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $15, %eax, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE-LABEL: test7:
; SSE: # BB#0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r14d
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %edx
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r9d
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r11d
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r8d
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %esi
; SSE-NEXT: imull $-109, %esi, %edi
; SSE-NEXT: shrl $8, %edi
; SSE-NEXT: addb %sil, %dil
; SSE-NEXT: movb %dil, %bl
; SSE-NEXT: shrb $7, %bl
; SSE-NEXT: sarb $2, %dil
; SSE-NEXT: addb %bl, %dil
; SSE-NEXT: movzbl %dil, %esi
; SSE-NEXT: movd %esi, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: imull $-109, %eax, %esi
; SSE-NEXT: shrl $8, %esi
; SSE-NEXT: addb %al, %sil
; SSE-NEXT: movb %sil, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %sil
; SSE-NEXT: addb %al, %sil
; SSE-NEXT: movzbl %sil, %eax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ebp
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %esi
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r10d
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %edi
; SSE-NEXT: imull $-109, %edi, %ebx
; SSE-NEXT: shrl $8, %ebx
; SSE-NEXT: addb %dil, %bl
; SSE-NEXT: movb %bl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %bl
; SSE-NEXT: addb %al, %bl
; SSE-NEXT: movzbl %bl, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: imull $-109, %edx, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %dl, %al
; SSE-NEXT: movb %al, %dl
; SSE-NEXT: shrb $7, %dl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %dl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: imull $-109, %esi, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %sil, %al
; SSE-NEXT: movb %al, %dl
; SSE-NEXT: shrb $7, %dl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %dl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE-NEXT: imull $-109, %ecx, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movb %al, %cl
; SSE-NEXT: shrb $7, %cl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %edx
; SSE-NEXT: shrl $8, %edx
; SSE-NEXT: addb %al, %dl
; SSE-NEXT: movb %dl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %dl
; SSE-NEXT: addb %al, %dl
; SSE-NEXT: movzbl %dl, %eax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: imull $-109, %r14d, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %r14b, %al
; SSE-NEXT: movb %al, %dl
; SSE-NEXT: shrb $7, %dl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %dl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: imull $-109, %ebp, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %bpl, %al
; SSE-NEXT: movb %al, %dl
; SSE-NEXT: shrb $7, %dl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %dl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: imull $-109, %r11d, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %r11b, %al
; SSE-NEXT: movb %al, %dl
; SSE-NEXT: shrb $7, %dl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %dl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: imull $-109, %ecx, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movb %al, %cl
; SSE-NEXT: shrb $7, %cl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: imull $-109, %r9d, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %r9b, %al
; SSE-NEXT: movb %al, %cl
; SSE-NEXT: shrb $7, %cl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: imull $-109, %r10d, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %r10b, %al
; SSE-NEXT: movb %al, %cl
; SSE-NEXT: shrb $7, %cl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE-NEXT: imull $-109, %r8d, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %r8b, %al
; SSE-NEXT: movb %al, %cl
; SSE-NEXT: shrb $7, %cl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm4
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX-LABEL: test7:
; AVX: # BB#0:
; AVX-NEXT: vpextrb $1, %xmm0, %eax
; AVX-NEXT: movsbl %al, %eax
; AVX-NEXT: imull $-109, %eax, %ecx
; AVX-NEXT: shrl $8, %ecx
; AVX-NEXT: addb %cl, %al
; AVX-NEXT: movb %al, %cl
; AVX-NEXT: shrb $7, %cl
; AVX-NEXT: sarb $2, %al
; AVX-NEXT: addb %cl, %al
; AVX-NEXT: movzbl %al, %eax
; AVX-NEXT: vpextrb $0, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %dl
; AVX-NEXT: shrb $7, %dl
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movzbl %cl, %ecx
; AVX-NEXT: vmovd %ecx, %xmm1
; AVX-NEXT: vpextrb $2, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $3, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $4, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $5, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $6, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $7, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $8, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $9, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $10, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $11, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $12, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $13, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $14, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpextrb $15, %xmm0, %ecx
; AVX-NEXT: movsbl %cl, %ecx
; AVX-NEXT: imull $-109, %ecx, %edx
; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0
; AVX-NEXT: shrl $8, %edx
; AVX-NEXT: addb %dl, %cl
; AVX-NEXT: movb %cl, %al
; AVX-NEXT: shrb $7, %al
; AVX-NEXT: sarb $2, %cl
; AVX-NEXT: addb %al, %cl
; AVX-NEXT: movzbl %cl, %eax
; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
  %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %div
}
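; sdiv <4 x i32> by 7 uses a signed multiply-high with 2454267027 (0x92492493):
; pmuldq on SSE4.1/AVX2; plain SSE2 reconstructs the signed high half from
; pmuludq plus sign-correction masks.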
define <4 x i32> @test8(<4 x i32> %a) #0 {
; SSE41-LABEL: test8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pmuldq %xmm2, %xmm3
; SSE41-NEXT: pmuldq %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $31, %xmm0
; SSE41-NEXT: psrad $2, %xmm1
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE-LABEL: test8:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrad $31, %xmm2
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: psrad $2, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test8:
; AVX: # BB#0:
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %div
}
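; 256-bit version of test8.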
define <8 x i32> @test9(<8 x i32> %a) #0 {
; SSE41-LABEL: test9:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE41-NEXT: pmuldq %xmm4, %xmm5
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pmuldq %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; SSE41-NEXT: paddd %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psrld $31, %xmm0
; SSE41-NEXT: psrad $2, %xmm2
; SSE41-NEXT: paddd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: pmuldq %xmm4, %xmm0
; SSE41-NEXT: pmuldq %xmm1, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7]
; SSE41-NEXT: paddd %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: psrld $31, %xmm0
; SSE41-NEXT: psrad $2, %xmm3
; SSE41-NEXT: paddd %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE-LABEL: test9:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrad $31, %xmm4
; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: psrad $31, %xmm5
; SSE-NEXT: pand %xmm3, %xmm5
; SSE-NEXT: paddd %xmm0, %xmm5
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSE-NEXT: pmuludq %xmm6, %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE-NEXT: psubd %xmm5, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrld $31, %xmm2
; SSE-NEXT: psrad $2, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: psrad $31, %xmm5
; SSE-NEXT: pand %xmm3, %xmm5
; SSE-NEXT: paddd %xmm4, %xmm5
; SSE-NEXT: pmuludq %xmm1, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm6, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: psubd %xmm5, %xmm2
; SSE-NEXT: paddd %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: psrld $31, %xmm1
; SSE-NEXT: psrad $2, %xmm2
; SSE-NEXT: paddd %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test9:
; AVX: # BB#0:
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vpsrld $31, %ymm0, %ymm1
; AVX-NEXT: vpsrad $2, %ymm0, %ymm0
; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %div
}
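; urem by 7 is the udiv-by-7 sequence followed by a multiply by 7 and a
; subtract: a - (a / 7) * 7.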
define <8 x i32> @test10(<8 x i32> %a) #0 {
; SSE41-LABEL: test10:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pmuludq %xmm2, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psubd %xmm5, %xmm4
; SSE41-NEXT: psrld $1, %xmm4
; SSE41-NEXT: paddd %xmm5, %xmm4
; SSE41-NEXT: psrld $2, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7]
; SSE41-NEXT: pmulld %xmm5, %xmm4
; SSE41-NEXT: psubd %xmm4, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm3, %xmm4
; SSE41-NEXT: pmuludq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psubd %xmm2, %xmm3
; SSE41-NEXT: psrld $1, %xmm3
; SSE41-NEXT: paddd %xmm2, %xmm3
; SSE41-NEXT: psrld $2, %xmm3
; SSE41-NEXT: pmulld %xmm5, %xmm3
; SSE41-NEXT: psubd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE-LABEL: test10:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: psubd %xmm2, %xmm5
; SSE-NEXT: psrld $1, %xmm5
; SSE-NEXT: paddd %xmm2, %xmm5
; SSE-NEXT: psrld $2, %xmm5
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
; SSE-NEXT: pmuludq %xmm2, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE-NEXT: pmuludq %xmm2, %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE-NEXT: psubd %xmm5, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psubd %xmm3, %xmm4
; SSE-NEXT: psrld $1, %xmm4
; SSE-NEXT: paddd %xmm3, %xmm4
; SSE-NEXT: psrld $2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; SSE-NEXT: psubd %xmm4, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test10:
; AVX: # BB#0:
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm2
; AVX-NEXT: vpsrld $1, %ymm2, %ymm2
; AVX-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX-NEXT: vpsrld $2, %ymm1, %ymm1
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %rem = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %rem
}
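; srem by 7, likewise: the sdiv-by-7 sequence followed by a multiply by 7 and a
; subtract.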
define <8 x i32> @test11(<8 x i32> %a) #0 {
; SSE41-LABEL: test11:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT: pmuldq %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pmuldq %xmm2, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
; SSE41-NEXT: paddd %xmm0, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm4
; SSE41-NEXT: psrld $31, %xmm4
; SSE41-NEXT: psrad $2, %xmm5
; SSE41-NEXT: paddd %xmm4, %xmm5
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7]
; SSE41-NEXT: pmulld %xmm4, %xmm5
; SSE41-NEXT: psubd %xmm5, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE41-NEXT: pmuldq %xmm3, %xmm5
; SSE41-NEXT: pmuldq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; SSE41-NEXT: paddd %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrld $31, %xmm3
; SSE41-NEXT: psrad $2, %xmm2
; SSE41-NEXT: paddd %xmm3, %xmm2
; SSE41-NEXT: pmulld %xmm4, %xmm2
; SSE41-NEXT: psubd %xmm2, %xmm1
; SSE41-NEXT: retq
;
; SSE-LABEL: test11:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: psrad $31, %xmm3
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: pand %xmm0, %xmm4
; SSE-NEXT: movdqa %xmm0, %xmm6
; SSE-NEXT: psrad $31, %xmm6
; SSE-NEXT: pand %xmm2, %xmm6
; SSE-NEXT: paddd %xmm4, %xmm6
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm5, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE-NEXT: psubd %xmm6, %xmm7
; SSE-NEXT: paddd %xmm0, %xmm7
; SSE-NEXT: movdqa %xmm7, %xmm4
; SSE-NEXT: psrld $31, %xmm4
; SSE-NEXT: psrad $2, %xmm7
; SSE-NEXT: paddd %xmm4, %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
; SSE-NEXT: pmuludq %xmm4, %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; SSE-NEXT: psubd %xmm7, %xmm0
; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: movdqa %xmm1, %xmm6
; SSE-NEXT: psrad $31, %xmm6
; SSE-NEXT: pand %xmm2, %xmm6
; SSE-NEXT: paddd %xmm3, %xmm6
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm5, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: psubd %xmm6, %xmm2
; SSE-NEXT: paddd %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: psrld $31, %xmm3
; SSE-NEXT: psrad $2, %xmm2
; SSE-NEXT: paddd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pmuludq %xmm4, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test11:
; AVX: # BB#0:
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX-NEXT: vpsrld $31, %ymm1, %ymm2
; AVX-NEXT: vpsrad $2, %ymm1, %ymm1
; AVX-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %rem = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %rem
}
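; urem of a constant vector by itself folds to zero at compile time; only a
; zeroed result register should remain.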
define <2 x i16> @test12() #0 {
; SSE41-LABEL: test12:
; SSE41: # BB#0:
; SSE41-NEXT: xorps %xmm0, %xmm0
; SSE41-NEXT: retq
;
; SSE-LABEL: test12:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test12:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0
  %I9 = insertelement <2 x i16> %I8, i16 -1, i32 1
  %B9 = urem <2 x i16> %I9, %I9
  ret <2 x i16> %B9
}
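; PR20355: sdiv <4 x i32> by 3 via multiply-high with 1431655766 (0x55555556);
; the quotient is the multiply-high result plus its sign bit.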
define <4 x i32> @PR20355(<4 x i32> %a) #0 {
; SSE41-LABEL: PR20355:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pmuldq %xmm2, %xmm3
; SSE41-NEXT: pmuldq %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $31, %xmm0
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE-LABEL: PR20355:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrad $31, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrad $31, %xmm3
; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: paddd %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT: psubd %xmm3, %xmm4
; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR20355:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %sdiv = sdiv <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %sdiv
}

attributes #0 = { nounwind }