1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW 7 8define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) { 9; SSE-LABEL: mulhuw_v4i16: 10; SSE: # %bb.0: 11; SSE-NEXT: pmulhuw %xmm1, %xmm0 12; SSE-NEXT: retq 13; 14; AVX-LABEL: mulhuw_v4i16: 15; AVX: # %bb.0: 16; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 17; AVX-NEXT: retq 18 %a1 = zext <4 x i16> %a to <4 x i32> 19 %b1 = zext <4 x i16> %b to <4 x i32> 20 %c = mul <4 x i32> %a1, %b1 21 %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16> 22 %e = trunc <4 x i32> %d to <4 x i16> 23 ret <4 x i16> %e 24} 25 26define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) { 27; SSE-LABEL: mulhw_v4i16: 28; SSE: # %bb.0: 29; SSE-NEXT: pmulhw %xmm1, %xmm0 30; SSE-NEXT: retq 31; 32; AVX-LABEL: mulhw_v4i16: 33; AVX: # %bb.0: 34; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 35; AVX-NEXT: retq 36 %a1 = sext <4 x i16> %a to <4 x i32> 37 %b1 = sext <4 x i16> %b to <4 x i32> 38 %c = mul <4 x i32> %a1, %b1 39 %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16> 40 %e = trunc <4 x i32> %d to <4 x i16> 41 ret <4 x i16> %e 42} 43 44define <8 x i16> @mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) { 45; SSE-LABEL: mulhuw_v8i16: 46; SSE: # %bb.0: 47; SSE-NEXT: pmulhuw %xmm1, %xmm0 48; SSE-NEXT: retq 49; 50; AVX-LABEL: mulhuw_v8i16: 51; AVX: # %bb.0: 52; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 53; AVX-NEXT: retq 54 %a1 = zext <8 x i16> %a to <8 x i32> 55 %b1 = zext <8 x i16> %b to <8 x i32> 56 %c 
= mul <8 x i32> %a1, %b1 57 %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 58 %e = trunc <8 x i32> %d to <8 x i16> 59 ret <8 x i16> %e 60} 61 62define <8 x i16> @mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) { 63; SSE-LABEL: mulhw_v8i16: 64; SSE: # %bb.0: 65; SSE-NEXT: pmulhw %xmm1, %xmm0 66; SSE-NEXT: retq 67; 68; AVX-LABEL: mulhw_v8i16: 69; AVX: # %bb.0: 70; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 71; AVX-NEXT: retq 72 %a1 = sext <8 x i16> %a to <8 x i32> 73 %b1 = sext <8 x i16> %b to <8 x i32> 74 %c = mul <8 x i32> %a1, %b1 75 %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 76 %e = trunc <8 x i32> %d to <8 x i16> 77 ret <8 x i16> %e 78} 79 80define <16 x i16> @mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) { 81; SSE-LABEL: mulhuw_v16i16: 82; SSE: # %bb.0: 83; SSE-NEXT: pmulhuw %xmm2, %xmm0 84; SSE-NEXT: pmulhuw %xmm3, %xmm1 85; SSE-NEXT: retq 86; 87; AVX-LABEL: mulhuw_v16i16: 88; AVX: # %bb.0: 89; AVX-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 90; AVX-NEXT: retq 91 %a1 = zext <16 x i16> %a to <16 x i32> 92 %b1 = zext <16 x i16> %b to <16 x i32> 93 %c = mul <16 x i32> %a1, %b1 94 %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 95 %e = trunc <16 x i32> %d to <16 x i16> 96 ret <16 x i16> %e 97} 98 99define <16 x i16> @mulhw_v16i16(<16 x i16> %a, <16 x i16> %b) { 100; SSE-LABEL: mulhw_v16i16: 101; SSE: # %bb.0: 102; SSE-NEXT: pmulhw %xmm2, %xmm0 103; SSE-NEXT: pmulhw %xmm3, %xmm1 104; SSE-NEXT: retq 105; 106; AVX-LABEL: mulhw_v16i16: 107; AVX: # %bb.0: 108; AVX-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 109; AVX-NEXT: retq 110 %a1 = sext <16 x i16> %a to <16 x i32> 111 %b1 = sext <16 x i16> %b to <16 x i32> 112 %c = mul <16 x i32> %a1, %b1 113 %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 114 %e = 
trunc <16 x i32> %d to <16 x i16> 115 ret <16 x i16> %e 116} 117 118define <32 x i16> @mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) { 119; SSE-LABEL: mulhuw_v32i16: 120; SSE: # %bb.0: 121; SSE-NEXT: pmulhuw %xmm4, %xmm0 122; SSE-NEXT: pmulhuw %xmm5, %xmm1 123; SSE-NEXT: pmulhuw %xmm6, %xmm2 124; SSE-NEXT: pmulhuw %xmm7, %xmm3 125; SSE-NEXT: retq 126; 127; AVX2-LABEL: mulhuw_v32i16: 128; AVX2: # %bb.0: 129; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 130; AVX2-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1 131; AVX2-NEXT: retq 132; 133; AVX512F-LABEL: mulhuw_v32i16: 134; AVX512F: # %bb.0: 135; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 136; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 137; AVX512F-NEXT: vpmulhuw %ymm2, %ymm3, %ymm2 138; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 139; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 140; AVX512F-NEXT: retq 141; 142; AVX512BW-LABEL: mulhuw_v32i16: 143; AVX512BW: # %bb.0: 144; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0 145; AVX512BW-NEXT: retq 146 %a1 = zext <32 x i16> %a to <32 x i32> 147 %b1 = zext <32 x i16> %b to <32 x i32> 148 %c = mul <32 x i32> %a1, %b1 149 %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 150 %e = trunc <32 x i32> %d to <32 x i16> 151 ret <32 x i16> %e 152} 153 154define <32 x i16> @mulhw_v32i16(<32 x i16> %a, <32 x i16> %b) { 155; SSE-LABEL: mulhw_v32i16: 156; SSE: # %bb.0: 157; SSE-NEXT: pmulhw %xmm4, %xmm0 158; SSE-NEXT: pmulhw %xmm5, %xmm1 159; SSE-NEXT: pmulhw %xmm6, %xmm2 160; SSE-NEXT: pmulhw %xmm7, %xmm3 161; SSE-NEXT: retq 162; 163; AVX2-LABEL: mulhw_v32i16: 164; AVX2: # %bb.0: 165; AVX2-NEXT: vpmulhw %ymm2, %ymm0, %ymm0 166; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1 167; AVX2-NEXT: retq 168; 169; AVX512F-LABEL: mulhw_v32i16: 170; AVX512F: # %bb.0: 171; AVX512F-NEXT: 
vextracti64x4 $1, %zmm1, %ymm2 172; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 173; AVX512F-NEXT: vpmulhw %ymm2, %ymm3, %ymm2 174; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 175; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 176; AVX512F-NEXT: retq 177; 178; AVX512BW-LABEL: mulhw_v32i16: 179; AVX512BW: # %bb.0: 180; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm0 181; AVX512BW-NEXT: retq 182 %a1 = sext <32 x i16> %a to <32 x i32> 183 %b1 = sext <32 x i16> %b to <32 x i32> 184 %c = mul <32 x i32> %a1, %b1 185 %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 186 %e = trunc <32 x i32> %d to <32 x i16> 187 ret <32 x i16> %e 188} 189 190define <64 x i16> @mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) { 191; SSE-LABEL: mulhuw_v64i16: 192; SSE: # %bb.0: 193; SSE-NEXT: movq %rdi, %rax 194; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0 195; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1 196; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2 197; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3 198; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 199; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5 200; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6 201; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm7 202; SSE-NEXT: movdqa %xmm7, 112(%rdi) 203; SSE-NEXT: movdqa %xmm6, 96(%rdi) 204; SSE-NEXT: movdqa %xmm5, 80(%rdi) 205; SSE-NEXT: movdqa %xmm4, 64(%rdi) 206; SSE-NEXT: movdqa %xmm3, 48(%rdi) 207; SSE-NEXT: movdqa %xmm2, 32(%rdi) 208; SSE-NEXT: movdqa %xmm1, 16(%rdi) 209; SSE-NEXT: movdqa %xmm0, (%rdi) 210; SSE-NEXT: retq 211; 212; AVX2-LABEL: mulhuw_v64i16: 213; AVX2: # %bb.0: 214; AVX2-NEXT: vpmulhuw %ymm4, %ymm0, %ymm0 215; AVX2-NEXT: vpmulhuw %ymm5, %ymm1, %ymm1 216; AVX2-NEXT: vpmulhuw %ymm6, %ymm2, %ymm2 217; AVX2-NEXT: vpmulhuw %ymm7, %ymm3, %ymm3 218; AVX2-NEXT: retq 219; 220; 
AVX512F-LABEL: mulhuw_v64i16: 221; AVX512F: # %bb.0: 222; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 223; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 224; AVX512F-NEXT: vpmulhuw %ymm4, %ymm5, %ymm4 225; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 226; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 227; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2 228; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 229; AVX512F-NEXT: vpmulhuw %ymm2, %ymm4, %ymm2 230; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1 231; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 232; AVX512F-NEXT: retq 233; 234; AVX512BW-LABEL: mulhuw_v64i16: 235; AVX512BW: # %bb.0: 236; AVX512BW-NEXT: vpmulhuw %zmm2, %zmm0, %zmm0 237; AVX512BW-NEXT: vpmulhuw %zmm3, %zmm1, %zmm1 238; AVX512BW-NEXT: retq 239 %a1 = zext <64 x i16> %a to <64 x i32> 240 %b1 = zext <64 x i16> %b to <64 x i32> 241 %c = mul <64 x i32> %a1, %b1 242 %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 243 %e = trunc <64 x i32> %d to <64 x i16> 244 ret <64 x i16> %e 245} 246 247define <64 x i16> @mulhw_v64i16(<64 x i16> %a, <64 x i16> %b) { 248; SSE-LABEL: mulhw_v64i16: 249; SSE: # %bb.0: 250; SSE-NEXT: movq %rdi, %rax 251; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 252; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 253; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 254; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 255; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 256; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 257; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 258; SSE-NEXT: pmulhw 
{{[0-9]+}}(%rsp), %xmm7 259; SSE-NEXT: movdqa %xmm7, 112(%rdi) 260; SSE-NEXT: movdqa %xmm6, 96(%rdi) 261; SSE-NEXT: movdqa %xmm5, 80(%rdi) 262; SSE-NEXT: movdqa %xmm4, 64(%rdi) 263; SSE-NEXT: movdqa %xmm3, 48(%rdi) 264; SSE-NEXT: movdqa %xmm2, 32(%rdi) 265; SSE-NEXT: movdqa %xmm1, 16(%rdi) 266; SSE-NEXT: movdqa %xmm0, (%rdi) 267; SSE-NEXT: retq 268; 269; AVX2-LABEL: mulhw_v64i16: 270; AVX2: # %bb.0: 271; AVX2-NEXT: vpmulhw %ymm4, %ymm0, %ymm0 272; AVX2-NEXT: vpmulhw %ymm5, %ymm1, %ymm1 273; AVX2-NEXT: vpmulhw %ymm6, %ymm2, %ymm2 274; AVX2-NEXT: vpmulhw %ymm7, %ymm3, %ymm3 275; AVX2-NEXT: retq 276; 277; AVX512F-LABEL: mulhw_v64i16: 278; AVX512F: # %bb.0: 279; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 280; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 281; AVX512F-NEXT: vpmulhw %ymm4, %ymm5, %ymm4 282; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0 283; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 284; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2 285; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 286; AVX512F-NEXT: vpmulhw %ymm2, %ymm4, %ymm2 287; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm1 288; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 289; AVX512F-NEXT: retq 290; 291; AVX512BW-LABEL: mulhw_v64i16: 292; AVX512BW: # %bb.0: 293; AVX512BW-NEXT: vpmulhw %zmm2, %zmm0, %zmm0 294; AVX512BW-NEXT: vpmulhw %zmm3, %zmm1, %zmm1 295; AVX512BW-NEXT: retq 296 %a1 = sext <64 x i16> %a to <64 x i32> 297 %b1 = sext <64 x i16> %b to <64 x i32> 298 %c = mul <64 x i32> %a1, %b1 299 %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, 
i32 16, i32 16, i32 16, i32 16> 300 %e = trunc <64 x i32> %d to <64 x i16> 301 ret <64 x i16> %e 302} 303 304define <8 x i16> @mulhuw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) { 305; SSE-LABEL: mulhuw_v8i16_i64: 306; SSE: # %bb.0: 307; SSE-NEXT: pmulhuw %xmm1, %xmm0 308; SSE-NEXT: retq 309; 310; AVX-LABEL: mulhuw_v8i16_i64: 311; AVX: # %bb.0: 312; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 313; AVX-NEXT: retq 314 %a1 = zext <8 x i16> %a to <8 x i64> 315 %b1 = zext <8 x i16> %b to <8 x i64> 316 %c = mul <8 x i64> %a1, %b1 317 %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> 318 %e = trunc <8 x i64> %d to <8 x i16> 319 ret <8 x i16> %e 320} 321 322define <8 x i16> @mulhw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) { 323; SSE-LABEL: mulhw_v8i16_i64: 324; SSE: # %bb.0: 325; SSE-NEXT: pmulhw %xmm1, %xmm0 326; SSE-NEXT: retq 327; 328; AVX-LABEL: mulhw_v8i16_i64: 329; AVX: # %bb.0: 330; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 331; AVX-NEXT: retq 332 %a1 = sext <8 x i16> %a to <8 x i64> 333 %b1 = sext <8 x i16> %b to <8 x i64> 334 %c = mul <8 x i64> %a1, %b1 335 %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> 336 %e = trunc <8 x i64> %d to <8 x i16> 337 ret <8 x i16> %e 338} 339 340define <4 x i32> @mulhuw_v4i16_lshr(<4 x i16> %a, <4 x i16> %b) { 341; SSE2-LABEL: mulhuw_v4i16_lshr: 342; SSE2: # %bb.0: 343; SSE2-NEXT: pmulhuw %xmm1, %xmm0 344; SSE2-NEXT: pxor %xmm1, %xmm1 345; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 346; SSE2-NEXT: retq 347; 348; SSE41-LABEL: mulhuw_v4i16_lshr: 349; SSE41: # %bb.0: 350; SSE41-NEXT: pmulhuw %xmm1, %xmm0 351; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 352; SSE41-NEXT: retq 353; 354; AVX-LABEL: mulhuw_v4i16_lshr: 355; AVX: # %bb.0: 356; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 357; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 358; 
AVX-NEXT: retq 359 %a1 = zext <4 x i16> %a to <4 x i32> 360 %b1 = zext <4 x i16> %b to <4 x i32> 361 %c = mul <4 x i32> %a1, %b1 362 %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16> 363 ret <4 x i32> %d 364} 365 366define <4 x i32> @mulhsw_v4i16_lshr(<4 x i16> %a, <4 x i16> %b) { 367; SSE2-LABEL: mulhsw_v4i16_lshr: 368; SSE2: # %bb.0: 369; SSE2-NEXT: pmulhw %xmm1, %xmm0 370; SSE2-NEXT: pxor %xmm1, %xmm1 371; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 372; SSE2-NEXT: retq 373; 374; SSE41-LABEL: mulhsw_v4i16_lshr: 375; SSE41: # %bb.0: 376; SSE41-NEXT: pmulhw %xmm1, %xmm0 377; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 378; SSE41-NEXT: retq 379; 380; AVX-LABEL: mulhsw_v4i16_lshr: 381; AVX: # %bb.0: 382; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 383; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 384; AVX-NEXT: retq 385 %a1 = sext <4 x i16> %a to <4 x i32> 386 %b1 = sext <4 x i16> %b to <4 x i32> 387 %c = mul <4 x i32> %a1, %b1 388 %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16> 389 ret <4 x i32> %d 390} 391 392define <4 x i32> @mulhsw_v4i16_ashr(<4 x i16> %a, <4 x i16> %b) { 393; SSE2-LABEL: mulhsw_v4i16_ashr: 394; SSE2: # %bb.0: 395; SSE2-NEXT: pmulhw %xmm1, %xmm0 396; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] 397; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 398; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 399; SSE2-NEXT: psrad $16, %xmm0 400; SSE2-NEXT: retq 401; 402; SSE41-LABEL: mulhsw_v4i16_ashr: 403; SSE41: # %bb.0: 404; SSE41-NEXT: pmulhw %xmm1, %xmm0 405; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 406; SSE41-NEXT: retq 407; 408; AVX-LABEL: mulhsw_v4i16_ashr: 409; AVX: # %bb.0: 410; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 411; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 412; AVX-NEXT: retq 413 %a1 = sext <4 x i16> %a to <4 x i32> 414 %b1 = sext <4 x i16> %b to <4 x i32> 415 %c = mul <4 x 
i32> %a1, %b1 416 %d = ashr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16> 417 ret <4 x i32> %d 418} 419 420define <8 x i32> @mulhuw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) { 421; SSE2-LABEL: mulhuw_v8i16_lshr: 422; SSE2: # %bb.0: 423; SSE2-NEXT: movdqa %xmm0, %xmm2 424; SSE2-NEXT: pmulhuw %xmm1, %xmm2 425; SSE2-NEXT: pxor %xmm1, %xmm1 426; SSE2-NEXT: movdqa %xmm2, %xmm0 427; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 428; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 429; SSE2-NEXT: movdqa %xmm2, %xmm1 430; SSE2-NEXT: retq 431; 432; SSE41-LABEL: mulhuw_v8i16_lshr: 433; SSE41: # %bb.0: 434; SSE41-NEXT: movdqa %xmm0, %xmm2 435; SSE41-NEXT: pmulhuw %xmm1, %xmm2 436; SSE41-NEXT: pxor %xmm1, %xmm1 437; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 438; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 439; SSE41-NEXT: movdqa %xmm2, %xmm1 440; SSE41-NEXT: retq 441; 442; AVX-LABEL: mulhuw_v8i16_lshr: 443; AVX: # %bb.0: 444; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 445; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 446; AVX-NEXT: retq 447 %a1 = zext <8 x i16> %a to <8 x i32> 448 %b1 = zext <8 x i16> %b to <8 x i32> 449 %c = mul <8 x i32> %a1, %b1 450 %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 451 ret <8 x i32> %d 452} 453 454define <8 x i32> @mulhsw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) { 455; SSE2-LABEL: mulhsw_v8i16_lshr: 456; SSE2: # %bb.0: 457; SSE2-NEXT: movdqa %xmm0, %xmm2 458; SSE2-NEXT: pmulhw %xmm1, %xmm2 459; SSE2-NEXT: pxor %xmm1, %xmm1 460; SSE2-NEXT: movdqa %xmm2, %xmm0 461; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 462; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 463; SSE2-NEXT: movdqa %xmm2, %xmm1 464; SSE2-NEXT: retq 465; 466; SSE41-LABEL: mulhsw_v8i16_lshr: 467; SSE41: # %bb.0: 468; SSE41-NEXT: movdqa %xmm0, %xmm2 469; SSE41-NEXT: pmulhw %xmm1, %xmm2 470; SSE41-NEXT: pxor %xmm1, %xmm1 471; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 472; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 473; SSE41-NEXT: movdqa %xmm2, %xmm1 474; SSE41-NEXT: retq 475; 476; AVX-LABEL: mulhsw_v8i16_lshr: 477; AVX: # %bb.0: 478; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 479; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 480; AVX-NEXT: retq 481 %a1 = sext <8 x i16> %a to <8 x i32> 482 %b1 = sext <8 x i16> %b to <8 x i32> 483 %c = mul <8 x i32> %a1, %b1 484 %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 485 ret <8 x i32> %d 486} 487 488define <8 x i32> @mulhsw_v8i16_ashr(<8 x i16> %a, <8 x i16> %b) { 489; SSE2-LABEL: mulhsw_v8i16_ashr: 490; SSE2: # %bb.0: 491; SSE2-NEXT: pmulhw %xmm1, %xmm0 492; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 493; SSE2-NEXT: psrad $16, %xmm2 494; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 495; SSE2-NEXT: psrad $16, %xmm1 496; SSE2-NEXT: movdqa %xmm2, %xmm0 497; SSE2-NEXT: retq 498; 499; SSE41-LABEL: mulhsw_v8i16_ashr: 500; SSE41: # %bb.0: 501; SSE41-NEXT: pmulhw %xmm1, %xmm0 502; SSE41-NEXT: pmovsxwd %xmm0, %xmm2 503; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 504; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 505; SSE41-NEXT: movdqa %xmm2, %xmm0 506; SSE41-NEXT: retq 507; 508; AVX-LABEL: mulhsw_v8i16_ashr: 509; AVX: # %bb.0: 510; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 511; AVX-NEXT: vpmovsxwd %xmm0, %ymm0 512; AVX-NEXT: 
retq 513 %a1 = sext <8 x i16> %a to <8 x i32> 514 %b1 = sext <8 x i16> %b to <8 x i32> 515 %c = mul <8 x i32> %a1, %b1 516 %d = ashr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 517 ret <8 x i32> %d 518} 519 520define <16 x i32> @mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { 521; SSE2-LABEL: mulhuw_v16i16_lshr: 522; SSE2: # %bb.0: 523; SSE2-NEXT: movdqa %xmm1, %xmm4 524; SSE2-NEXT: movdqa %xmm0, %xmm1 525; SSE2-NEXT: pmulhuw %xmm3, %xmm4 526; SSE2-NEXT: pmulhuw %xmm2, %xmm1 527; SSE2-NEXT: pxor %xmm3, %xmm3 528; SSE2-NEXT: movdqa %xmm1, %xmm0 529; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 530; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 531; SSE2-NEXT: movdqa %xmm4, %xmm2 532; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 533; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 534; SSE2-NEXT: movdqa %xmm4, %xmm3 535; SSE2-NEXT: retq 536; 537; SSE41-LABEL: mulhuw_v16i16_lshr: 538; SSE41: # %bb.0: 539; SSE41-NEXT: movdqa %xmm1, %xmm4 540; SSE41-NEXT: movdqa %xmm0, %xmm1 541; SSE41-NEXT: pmulhuw %xmm2, %xmm1 542; SSE41-NEXT: pxor %xmm5, %xmm5 543; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 544; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 545; SSE41-NEXT: pmulhuw %xmm3, %xmm4 546; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 547; SSE41-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 548; SSE41-NEXT: movdqa %xmm4, %xmm3 549; SSE41-NEXT: retq 550; 551; AVX2-LABEL: mulhuw_v16i16_lshr: 552; AVX2: # %bb.0: 553; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm1 554; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 555; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 556; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 557; AVX2-NEXT: retq 558; 559; AVX512-LABEL: mulhuw_v16i16_lshr: 560; AVX512: # %bb.0: 561; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 562; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 563; AVX512-NEXT: retq 564 %a1 = zext <16 x i16> %a to <16 x i32> 565 %b1 = zext <16 x i16> %b to <16 x i32> 566 %c = mul <16 x i32> %a1, %b1 567 %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 568 ret <16 x i32> %d 569} 570 571define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { 572; SSE2-LABEL: mulhsw_v16i16_lshr: 573; SSE2: # %bb.0: 574; SSE2-NEXT: movdqa %xmm1, %xmm4 575; SSE2-NEXT: movdqa %xmm0, %xmm1 576; SSE2-NEXT: pmulhw %xmm3, %xmm4 577; SSE2-NEXT: pmulhw %xmm2, %xmm1 578; SSE2-NEXT: pxor %xmm3, %xmm3 579; SSE2-NEXT: movdqa %xmm1, %xmm0 580; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 581; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 582; SSE2-NEXT: movdqa %xmm4, %xmm2 583; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 584; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 585; SSE2-NEXT: movdqa %xmm4, %xmm3 586; SSE2-NEXT: retq 587; 588; SSE41-LABEL: mulhsw_v16i16_lshr: 589; SSE41: # %bb.0: 590; SSE41-NEXT: movdqa %xmm1, %xmm4 591; SSE41-NEXT: 
movdqa %xmm0, %xmm1 592; SSE41-NEXT: pmulhw %xmm2, %xmm1 593; SSE41-NEXT: pxor %xmm5, %xmm5 594; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 595; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 596; SSE41-NEXT: pmulhw %xmm3, %xmm4 597; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 598; SSE41-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 599; SSE41-NEXT: movdqa %xmm4, %xmm3 600; SSE41-NEXT: retq 601; 602; AVX2-LABEL: mulhsw_v16i16_lshr: 603; AVX2: # %bb.0: 604; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm1 605; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 606; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 607; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 608; AVX2-NEXT: retq 609; 610; AVX512-LABEL: mulhsw_v16i16_lshr: 611; AVX512: # %bb.0: 612; AVX512-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 613; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 614; AVX512-NEXT: retq 615 %a1 = sext <16 x i16> %a to <16 x i32> 616 %b1 = sext <16 x i16> %b to <16 x i32> 617 %c = mul <16 x i32> %a1, %b1 618 %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 619 ret <16 x i32> %d 620} 621 622define <16 x i32> @mulhsw_v16i16_ashr(<16 x i16> %a, <16 x i16> %b) { 623; SSE2-LABEL: mulhsw_v16i16_ashr: 624; SSE2: # %bb.0: 625; SSE2-NEXT: pmulhw %xmm3, %xmm1 626; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 627; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 628; SSE2-NEXT: pmulhw %xmm2, %xmm0 629; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 630; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 631; SSE2-NEXT: psrad $16, %xmm0 632; SSE2-NEXT: psrad $16, %xmm1 633; SSE2-NEXT: psrad $16, %xmm4 634; SSE2-NEXT: psrad $16, %xmm3 635; SSE2-NEXT: movdqa %xmm4, %xmm2 636; SSE2-NEXT: retq 637; 638; SSE41-LABEL: mulhsw_v16i16_ashr: 639; SSE41: # %bb.0: 640; SSE41-NEXT: pmulhw %xmm2, %xmm0 641; SSE41-NEXT: pmovsxwd %xmm0, %xmm4 642; SSE41-NEXT: pmulhw %xmm3, %xmm1 643; SSE41-NEXT: pmovsxwd %xmm1, %xmm2 644; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 645; SSE41-NEXT: pmovsxwd %xmm0, %xmm5 646; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 647; SSE41-NEXT: pmovsxwd %xmm0, %xmm3 648; SSE41-NEXT: movdqa %xmm4, %xmm0 649; SSE41-NEXT: movdqa %xmm5, %xmm1 650; SSE41-NEXT: retq 651; 652; AVX2-LABEL: mulhsw_v16i16_ashr: 653; AVX2: # %bb.0: 654; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm1 655; AVX2-NEXT: vpmovsxwd %xmm1, %ymm0 656; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 657; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 658; AVX2-NEXT: retq 659; 660; AVX512-LABEL: mulhsw_v16i16_ashr: 661; AVX512: # %bb.0: 662; AVX512-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 663; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 664; AVX512-NEXT: retq 665 %a1 = sext <16 x i16> %a to <16 x i32> 666 %b1 = sext <16 x i16> %b to <16 x i32> 667 %c = mul <16 x i32> %a1, %b1 668 %d = ashr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 669 ret <16 x i32> %d 670} 671 672define <32 x i32> @mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) { 673; SSE2-LABEL: mulhuw_v32i16_lshr: 674; SSE2: # %bb.0: 675; SSE2-NEXT: movq %rdi, %rax 676; SSE2-NEXT: pmulhuw %xmm7, %xmm3 
677; SSE2-NEXT: pmulhuw %xmm6, %xmm2 678; SSE2-NEXT: pmulhuw %xmm5, %xmm1 679; SSE2-NEXT: pmulhuw %xmm4, %xmm0 680; SSE2-NEXT: pxor %xmm4, %xmm4 681; SSE2-NEXT: movdqa %xmm0, %xmm8 682; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] 683; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 684; SSE2-NEXT: movdqa %xmm1, %xmm6 685; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 686; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 687; SSE2-NEXT: movdqa %xmm2, %xmm7 688; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] 689; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 690; SSE2-NEXT: movdqa %xmm3, %xmm5 691; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 692; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 693; SSE2-NEXT: movdqa %xmm3, 112(%rdi) 694; SSE2-NEXT: movdqa %xmm5, 96(%rdi) 695; SSE2-NEXT: movdqa %xmm2, 80(%rdi) 696; SSE2-NEXT: movdqa %xmm7, 64(%rdi) 697; SSE2-NEXT: movdqa %xmm1, 48(%rdi) 698; SSE2-NEXT: movdqa %xmm6, 32(%rdi) 699; SSE2-NEXT: movdqa %xmm0, 16(%rdi) 700; SSE2-NEXT: movdqa %xmm8, (%rdi) 701; SSE2-NEXT: retq 702; 703; SSE41-LABEL: mulhuw_v32i16_lshr: 704; SSE41: # %bb.0: 705; SSE41-NEXT: movq %rdi, %rax 706; SSE41-NEXT: pmulhuw %xmm4, %xmm0 707; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 708; SSE41-NEXT: pxor %xmm4, %xmm4 709; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 710; SSE41-NEXT: pmulhuw %xmm5, %xmm1 711; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 712; SSE41-NEXT: punpckhwd 
{{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 713; SSE41-NEXT: pmulhuw %xmm6, %xmm2 714; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 715; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 716; SSE41-NEXT: pmulhuw %xmm7, %xmm3 717; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 718; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 719; SSE41-NEXT: movdqa %xmm3, 112(%rdi) 720; SSE41-NEXT: movdqa %xmm7, 96(%rdi) 721; SSE41-NEXT: movdqa %xmm2, 80(%rdi) 722; SSE41-NEXT: movdqa %xmm6, 64(%rdi) 723; SSE41-NEXT: movdqa %xmm1, 48(%rdi) 724; SSE41-NEXT: movdqa %xmm5, 32(%rdi) 725; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 726; SSE41-NEXT: movdqa %xmm8, (%rdi) 727; SSE41-NEXT: retq 728; 729; AVX2-LABEL: mulhuw_v32i16_lshr: 730; AVX2: # %bb.0: 731; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 732; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 733; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 734; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 735; AVX2-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1 736; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 737; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 738; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 739; AVX2-NEXT: vmovdqa %ymm4, %ymm1 740; AVX2-NEXT: retq 741; 742; AVX512F-LABEL: mulhuw_v32i16_lshr: 743; AVX512F: # %bb.0: 744; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 745; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 746; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 747; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 748; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 749; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 750; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 751; AVX512F-NEXT: retq 752; 753; AVX512BW-LABEL: mulhuw_v32i16_lshr: 754; AVX512BW: # %bb.0: 755; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm1 756; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 757; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 758; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 759; AVX512BW-NEXT: retq 760 %a1 = zext <32 x i16> %a to <32 x i32> 761 %b1 = zext <32 x i16> %b to <32 x i32> 762 %c = mul <32 x i32> %a1, %b1 763 %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 764 ret <32 x i32> %d 765} 766 767define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) { 768; SSE2-LABEL: mulhsw_v32i16_lshr: 769; SSE2: # %bb.0: 770; SSE2-NEXT: movq %rdi, %rax 
; SSE2-NEXT: pmulhw %xmm7, %xmm3
; SSE2-NEXT: pmulhw %xmm6, %xmm2
; SSE2-NEXT: pmulhw %xmm5, %xmm1
; SSE2-NEXT: pmulhw %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm8
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm3, 112(%rdi)
; SSE2-NEXT: movdqa %xmm5, 96(%rdi)
; SSE2-NEXT: movdqa %xmm2, 80(%rdi)
; SSE2-NEXT: movdqa %xmm7, 64(%rdi)
; SSE2-NEXT: movdqa %xmm1, 48(%rdi)
; SSE2-NEXT: movdqa %xmm6, 32(%rdi)
; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
; SSE2-NEXT: movdqa %xmm8, (%rdi)
; SSE2-NEXT: retq
;
; SSE41-LABEL: mulhsw_v32i16_lshr:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmulhw %xmm4, %xmm0
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE41-NEXT: pmulhw %xmm5, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE41-NEXT: pmulhw %xmm6, %xmm2
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; SSE41-NEXT: pmulhw %xmm7, %xmm3
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE41-NEXT: movdqa %xmm3, 112(%rdi)
; SSE41-NEXT: movdqa %xmm7, 96(%rdi)
; SSE41-NEXT: movdqa %xmm2, 80(%rdi)
; SSE41-NEXT: movdqa %xmm6, 64(%rdi)
; SSE41-NEXT: movdqa %xmm1, 48(%rdi)
; SSE41-NEXT: movdqa %xmm5, 32(%rdi)
; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
; SSE41-NEXT: movdqa %xmm8, (%rdi)
; SSE41-NEXT: retq
;
; AVX2-LABEL: mulhsw_v32i16_lshr:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vmovdqa %ymm4, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mulhsw_v32i16_lshr:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhsw_v32i16_lshr:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512BW-NEXT: retq
  %a1 = sext <32 x i16> %a to <32 x i32>
  %b1 = sext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <32 x i32> %d
}

; Signed source, arithmetic shift: sext to <32 x i32>, multiply, ashr by 16.
; The pmulhw result is sign-extended back to i32 (psrad / pmovsxwd below).
define <32 x i32> @mulhsw_v32i16_ashr(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: mulhsw_v32i16_ashr:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: pmulhw %xmm7, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pmulhw %xmm6, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pmulhw %xmm5, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pmulhw %xmm4, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm6
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm7
; SSE2-NEXT: movdqa %xmm7, 112(%rdi)
; SSE2-NEXT: movdqa %xmm3, 96(%rdi)
; SSE2-NEXT: movdqa %xmm6, 80(%rdi)
; SSE2-NEXT: movdqa %xmm2, 64(%rdi)
; SSE2-NEXT: movdqa %xmm5, 48(%rdi)
; SSE2-NEXT: movdqa %xmm1, 32(%rdi)
; SSE2-NEXT: movdqa %xmm4, 16(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE41-LABEL: mulhsw_v32i16_ashr:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmulhw %xmm4, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm4, %xmm4
; SSE41-NEXT: pmulhw %xmm5, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm5, %xmm5
; SSE41-NEXT: pmulhw %xmm6, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm6, %xmm6
; SSE41-NEXT: pmulhw %xmm7, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm7, %xmm7
; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
; SSE41-NEXT: pmovsxwd %xmm2, %xmm2
; SSE41-NEXT: pmovsxwd %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, 96(%rdi)
; SSE41-NEXT: movdqa %xmm2, 64(%rdi)
; SSE41-NEXT: movdqa %xmm1, 32(%rdi)
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: movdqa %xmm7, 112(%rdi)
; SSE41-NEXT: movdqa %xmm6, 80(%rdi)
; SSE41-NEXT: movdqa %xmm5, 48(%rdi)
; SSE41-NEXT: movdqa %xmm4, 16(%rdi)
; SSE41-NEXT: retq
;
; AVX2-LABEL: mulhsw_v32i16_ashr:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpmovsxwd %xmm2, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vpmovsxwd %xmm2, %ymm4
; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm3
; AVX2-NEXT: vmovdqa %ymm4, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mulhsw_v32i16_ashr:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm1
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhsw_v32i16_ashr:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512BW-NEXT: retq
  %a1 = sext <32 x i16> %a to <32 x i32>
  %b1 = sext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = ashr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <32 x i32> %d
}

; 64-element unsigned variant. The <64 x i32> result does not fit in vector
; registers on the narrower targets, so it is written through the pointer in
; %rdi (note the movq %rdi, %rax and the indexed stores below).
define <64 x i32> @mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
; SSE2-LABEL: mulhuw_v64i16_lshr:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6
; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3
; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2
; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: pxor %xmm11, %xmm11
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm1, %xmm9
; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm2, %xmm10
; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm3, %xmm12
; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm4, %xmm13
; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm5, %xmm14
; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm6, %xmm15
; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm8, %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm8, 240(%rdi)
; SSE2-NEXT: movdqa %xmm7, 224(%rdi)
; SSE2-NEXT: movdqa %xmm6, 208(%rdi)
; SSE2-NEXT: movdqa %xmm15, 192(%rdi)
; SSE2-NEXT: movdqa %xmm5, 176(%rdi)
; SSE2-NEXT: movdqa %xmm14, 160(%rdi)
; SSE2-NEXT: movdqa %xmm4, 144(%rdi)
; SSE2-NEXT: movdqa %xmm13, 128(%rdi)
; SSE2-NEXT: movdqa %xmm3, 112(%rdi)
; SSE2-NEXT: movdqa %xmm12, 96(%rdi)
; SSE2-NEXT: movdqa %xmm2, 80(%rdi)
; SSE2-NEXT: movdqa %xmm10, 64(%rdi)
; SSE2-NEXT: movdqa %xmm1, 48(%rdi)
; SSE2-NEXT: movdqa %xmm9, 32(%rdi)
; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movaps %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE41-LABEL: mulhuw_v64i16_lshr:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8
; SSE41-NEXT: pxor %xmm11, %xmm11
; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
; SSE41-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm9
; SSE41-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm10
; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm12
; SSE41-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm13
; SSE41-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm14
; SSE41-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6
; SSE41-NEXT: movdqa %xmm6, %xmm15
; SSE41-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7]
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm7
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
; SSE41-NEXT: movdqa %xmm7, 240(%rdi)
; SSE41-NEXT: movdqa %xmm0, 224(%rdi)
; SSE41-NEXT: movdqa %xmm15, 208(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
; SSE41-NEXT: movdqa %xmm0, 192(%rdi)
; SSE41-NEXT: movdqa %xmm14, 176(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; SSE41-NEXT: movdqa %xmm0, 160(%rdi)
; SSE41-NEXT: movdqa %xmm13, 144(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; SSE41-NEXT: movdqa %xmm0, 128(%rdi)
; SSE41-NEXT: movdqa %xmm12, 112(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT: movdqa %xmm0, 96(%rdi)
; SSE41-NEXT: movdqa %xmm10, 80(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT: movdqa %xmm0, 64(%rdi)
; SSE41-NEXT: movdqa %xmm9, 48(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: movdqa %xmm0, 32(%rdi)
; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE41-NEXT: movaps %xmm0, 16(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX2-LABEL: mulhuw_v64i16_lshr:
; AVX2: # %bb.0:
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vpmulhuw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmulhuw %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmulhuw %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmulhuw %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi)
; AVX2-NEXT: vmovdqa %ymm7, 192(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, 160(%rdi)
; AVX2-NEXT: vmovdqa %ymm6, 128(%rdi)
; AVX2-NEXT: vmovdqa %ymm1, 96(%rdi)
; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mulhuw_v64i16_lshr:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm4
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpmulhuw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhuw_v64i16_lshr:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhuw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm2
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512BW-NEXT: vpmulhuw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1
; AVX512BW-NEXT: retq
  %a1 = zext <64 x i16> %a to <64 x i32>
  %b1 = zext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <64 x i32> %d
}

; 64-element signed variant with logical shift; the pmulhw result is
; zero-extended (pxor zero register + punpcklwd/punpckhwd interleaves below).
define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
; SSE2-LABEL: mulhsw_v64i16_lshr:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: pxor %xmm11, %xmm11
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm1, %xmm9
; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm2, %xmm10
; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm3, %xmm12
; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm4, %xmm13
; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm5, %xmm14
; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm6, %xmm15
; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm8, %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm8, 240(%rdi)
; SSE2-NEXT: movdqa %xmm7, 224(%rdi)
; SSE2-NEXT: movdqa %xmm6, 208(%rdi)
; SSE2-NEXT: movdqa %xmm15, 192(%rdi)
; SSE2-NEXT: movdqa %xmm5, 176(%rdi)
; SSE2-NEXT: movdqa %xmm14, 160(%rdi)
; SSE2-NEXT: movdqa %xmm4, 144(%rdi)
; SSE2-NEXT: movdqa %xmm13, 128(%rdi)
; SSE2-NEXT: movdqa %xmm3, 112(%rdi)
; SSE2-NEXT: movdqa %xmm12, 96(%rdi)
; SSE2-NEXT: movdqa %xmm2, 80(%rdi)
; SSE2-NEXT: movdqa %xmm10, 64(%rdi)
; SSE2-NEXT: movdqa %xmm1, 48(%rdi)
; SSE2-NEXT: movdqa %xmm9, 32(%rdi)
; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movaps %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE41-LABEL: mulhsw_v64i16_lshr:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8
; SSE41-NEXT: pxor %xmm11, %xmm11
; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
; SSE41-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm9
; SSE41-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm10
; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm12
; SSE41-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm13
; SSE41-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm14
; SSE41-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
; SSE41-NEXT: movdqa %xmm6, %xmm15
; SSE41-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7]
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
; SSE41-NEXT: movdqa %xmm7, 240(%rdi)
; SSE41-NEXT: movdqa %xmm0, 224(%rdi)
; SSE41-NEXT: movdqa %xmm15, 208(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
; SSE41-NEXT: movdqa %xmm0, 192(%rdi)
; SSE41-NEXT: movdqa %xmm14, 176(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; SSE41-NEXT: movdqa %xmm0, 160(%rdi)
; SSE41-NEXT: movdqa %xmm13, 144(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; SSE41-NEXT: movdqa %xmm0, 128(%rdi)
; SSE41-NEXT: movdqa %xmm12, 112(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT: movdqa %xmm0, 96(%rdi)
; SSE41-NEXT: movdqa %xmm10, 80(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT: movdqa %xmm0, 64(%rdi)
; SSE41-NEXT: movdqa %xmm9, 48(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: movdqa %xmm0, 32(%rdi)
; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE41-NEXT: movaps %xmm0, 16(%rdi)
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; NOTE(review): AVX2 also returns the oversized <64 x i32> result through the
; pointer in %rdi (movq %rdi, %rax and the eight 32-byte stores below).
; AVX2-LABEL: mulhsw_v64i16_lshr:
; AVX2: # %bb.0:
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vpmulhw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmulhw %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmulhw %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmulhw %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi)
; AVX2-NEXT: vmovdqa %ymm7, 192(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, 160(%rdi)
; AVX2-NEXT: vmovdqa %ymm6, 128(%rdi)
; AVX2-NEXT: vmovdqa %ymm1, 96(%rdi)
; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mulhsw_v64i16_lshr:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm4
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpmulhw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhsw_v64i16_lshr:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm2
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512BW-NEXT: vpmulhw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1
; AVX512BW-NEXT: retq
  %a1 = sext <64 x i16> %a to <64 x i32>
  %b1 = sext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <64 x i32> %d
}

; 64-element counterpart of mulhsw_v32i16_ashr — presumably sext, mul, then
; ashr by 16 (body and remaining checks continue past this chunk).
define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
;
SSE2-LABEL: mulhsw_v64i16_ashr: 1327; SSE2: # %bb.0: 1328; SSE2-NEXT: movq %rdi, %rax 1329; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 1330; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 1331; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] 1332; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 1333; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] 1334; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] 1335; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 1336; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] 1337; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] 1338; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 1339; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] 1340; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] 1341; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 1342; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 1343; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1344; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 1345; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] 1346; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1347; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 1348; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 1349; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 1350; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 1351; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] 1352; SSE2-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1353; SSE2-NEXT: psrad $16, %xmm0 1354; SSE2-NEXT: psrad $16, %xmm7 1355; SSE2-NEXT: psrad $16, %xmm1 1356; SSE2-NEXT: psrad $16, %xmm4 1357; SSE2-NEXT: psrad $16, %xmm2 1358; SSE2-NEXT: psrad $16, %xmm6 1359; SSE2-NEXT: psrad $16, %xmm3 1360; SSE2-NEXT: psrad $16, %xmm5 1361; SSE2-NEXT: psrad $16, %xmm14 1362; SSE2-NEXT: psrad $16, %xmm15 1363; SSE2-NEXT: psrad $16, %xmm12 1364; SSE2-NEXT: psrad $16, %xmm13 1365; SSE2-NEXT: psrad $16, %xmm10 1366; SSE2-NEXT: psrad $16, %xmm11 1367; SSE2-NEXT: psrad $16, %xmm9 1368; SSE2-NEXT: psrad $16, %xmm8 1369; SSE2-NEXT: movdqa %xmm8, 240(%rdi) 1370; SSE2-NEXT: movdqa %xmm9, 224(%rdi) 1371; SSE2-NEXT: movdqa %xmm11, 208(%rdi) 1372; SSE2-NEXT: movdqa %xmm10, 192(%rdi) 1373; SSE2-NEXT: movdqa %xmm13, 176(%rdi) 1374; SSE2-NEXT: movdqa %xmm12, 160(%rdi) 1375; SSE2-NEXT: movdqa %xmm15, 144(%rdi) 1376; SSE2-NEXT: movdqa %xmm14, 128(%rdi) 1377; SSE2-NEXT: movdqa %xmm5, 112(%rdi) 1378; SSE2-NEXT: movdqa %xmm3, 96(%rdi) 1379; SSE2-NEXT: movdqa %xmm6, 80(%rdi) 1380; SSE2-NEXT: movdqa %xmm2, 64(%rdi) 1381; SSE2-NEXT: movdqa %xmm4, 48(%rdi) 1382; SSE2-NEXT: movdqa %xmm1, 32(%rdi) 1383; SSE2-NEXT: movdqa %xmm7, 16(%rdi) 1384; SSE2-NEXT: movdqa %xmm0, (%rdi) 1385; SSE2-NEXT: retq 1386; 1387; SSE41-LABEL: mulhsw_v64i16_ashr: 1388; SSE41: # %bb.0: 1389; SSE41-NEXT: movq %rdi, %rax 1390; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 1391; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] 1392; SSE41-NEXT: pmovsxwd %xmm8, %xmm8 1393; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 1394; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] 1395; SSE41-NEXT: pmovsxwd %xmm9, %xmm9 1396; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 1397; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] 1398; SSE41-NEXT: pmovsxwd %xmm10, %xmm10 1399; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 1400; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] 1401; SSE41-NEXT: pmovsxwd %xmm11, %xmm11 1402; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), 
%xmm4 1403; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3] 1404; SSE41-NEXT: pmovsxwd %xmm12, %xmm12 1405; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 1406; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] 1407; SSE41-NEXT: pmovsxwd %xmm13, %xmm13 1408; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 1409; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] 1410; SSE41-NEXT: pmovsxwd %xmm14, %xmm14 1411; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 1412; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] 1413; SSE41-NEXT: pmovsxwd %xmm15, %xmm15 1414; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 1415; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 1416; SSE41-NEXT: pmovsxwd %xmm2, %xmm2 1417; SSE41-NEXT: pmovsxwd %xmm3, %xmm3 1418; SSE41-NEXT: pmovsxwd %xmm4, %xmm4 1419; SSE41-NEXT: pmovsxwd %xmm5, %xmm5 1420; SSE41-NEXT: pmovsxwd %xmm6, %xmm6 1421; SSE41-NEXT: pmovsxwd %xmm7, %xmm7 1422; SSE41-NEXT: movdqa %xmm7, 224(%rdi) 1423; SSE41-NEXT: movdqa %xmm6, 192(%rdi) 1424; SSE41-NEXT: movdqa %xmm5, 160(%rdi) 1425; SSE41-NEXT: movdqa %xmm4, 128(%rdi) 1426; SSE41-NEXT: movdqa %xmm3, 96(%rdi) 1427; SSE41-NEXT: movdqa %xmm2, 64(%rdi) 1428; SSE41-NEXT: movdqa %xmm1, 32(%rdi) 1429; SSE41-NEXT: movdqa %xmm0, (%rdi) 1430; SSE41-NEXT: movdqa %xmm15, 240(%rdi) 1431; SSE41-NEXT: movdqa %xmm14, 208(%rdi) 1432; SSE41-NEXT: movdqa %xmm13, 176(%rdi) 1433; SSE41-NEXT: movdqa %xmm12, 144(%rdi) 1434; SSE41-NEXT: movdqa %xmm11, 112(%rdi) 1435; SSE41-NEXT: movdqa %xmm10, 80(%rdi) 1436; SSE41-NEXT: movdqa %xmm9, 48(%rdi) 1437; SSE41-NEXT: movdqa %xmm8, 16(%rdi) 1438; SSE41-NEXT: retq 1439; 1440; AVX2-LABEL: mulhsw_v64i16_ashr: 1441; AVX2: # %bb.0: 1442; AVX2-NEXT: movq %rdi, %rax 1443; AVX2-NEXT: vpmulhw %ymm4, %ymm0, %ymm0 1444; AVX2-NEXT: vpmovsxwd %xmm0, %ymm4 1445; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1446; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 1447; AVX2-NEXT: vpmulhw %ymm5, %ymm1, %ymm1 1448; AVX2-NEXT: vpmovsxwd %xmm1, %ymm5 1449; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 1450; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 
1451; AVX2-NEXT: vpmulhw %ymm6, %ymm2, %ymm2 1452; AVX2-NEXT: vpmovsxwd %xmm2, %ymm6 1453; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 1454; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2 1455; AVX2-NEXT: vpmulhw %ymm7, %ymm3, %ymm3 1456; AVX2-NEXT: vpmovsxwd %xmm3, %ymm7 1457; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 1458; AVX2-NEXT: vpmovsxwd %xmm3, %ymm3 1459; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi) 1460; AVX2-NEXT: vmovdqa %ymm7, 192(%rdi) 1461; AVX2-NEXT: vmovdqa %ymm2, 160(%rdi) 1462; AVX2-NEXT: vmovdqa %ymm6, 128(%rdi) 1463; AVX2-NEXT: vmovdqa %ymm1, 96(%rdi) 1464; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi) 1465; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi) 1466; AVX2-NEXT: vmovdqa %ymm4, (%rdi) 1467; AVX2-NEXT: vzeroupper 1468; AVX2-NEXT: retq 1469; 1470; AVX512F-LABEL: mulhsw_v64i16_ashr: 1471; AVX512F: # %bb.0: 1472; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm4 1473; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4 1474; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 1475; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1476; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0 1477; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm5 1478; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm0 1479; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm2 1480; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0 1481; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1482; AVX512F-NEXT: vpmulhw %ymm0, %ymm1, %ymm0 1483; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm3 1484; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 1485; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 1486; AVX512F-NEXT: retq 1487; 1488; AVX512BW-LABEL: mulhsw_v64i16_ashr: 1489; AVX512BW: # %bb.0: 1490; AVX512BW-NEXT: vpmulhw %zmm2, %zmm0, %zmm2 1491; AVX512BW-NEXT: vpmovsxwd %ymm2, %zmm0 1492; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm2 1493; AVX512BW-NEXT: vpmovsxwd %ymm2, %zmm4 1494; AVX512BW-NEXT: vpmulhw %zmm3, %zmm1, %zmm1 1495; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm2 1496; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1497; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm3 1498; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 1499; AVX512BW-NEXT: retq 1500 
%a1 = sext <64 x i16> %a to <64 x i32> 1501 %b1 = sext <64 x i16> %b to <64 x i32> 1502 %c = mul <64 x i32> %a1, %b1 1503 %d = ashr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 1504 ret <64 x i32> %d 1505} 1506 1507define <8 x i64> @mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { 1508; SSE2-LABEL: mulhuw_v8i16_lshr_i64: 1509; SSE2: # %bb.0: 1510; SSE2-NEXT: pxor %xmm2, %xmm2 1511; SSE2-NEXT: movdqa %xmm0, %xmm3 1512; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 1513; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,1,3] 1514; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 1515; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1516; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3] 1517; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,1,3,3] 1518; SSE2-NEXT: movdqa %xmm1, %xmm7 1519; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] 1520; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] 1521; SSE2-NEXT: pmuludq %xmm4, %xmm0 1522; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,1,3,3] 1523; SSE2-NEXT: pmuludq %xmm3, %xmm4 1524; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1525; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] 1526; SSE2-NEXT: pmuludq %xmm5, %xmm2 1527; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] 1528; SSE2-NEXT: pmuludq %xmm6, %xmm3 1529; SSE2-NEXT: psrlq $16, %xmm0 1530; 
SSE2-NEXT: psrlq $16, %xmm4 1531; SSE2-NEXT: psrlq $16, %xmm2 1532; SSE2-NEXT: psrlq $16, %xmm3 1533; SSE2-NEXT: movdqa %xmm4, %xmm1 1534; SSE2-NEXT: retq 1535; 1536; SSE41-LABEL: mulhuw_v8i16_lshr_i64: 1537; SSE41: # %bb.0: 1538; SSE41-NEXT: pmulhuw %xmm1, %xmm0 1539; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 1540; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1541; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1542; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 1543; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1544; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 1545; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 1546; SSE41-NEXT: movdqa %xmm4, %xmm0 1547; SSE41-NEXT: retq 1548; 1549; AVX2-LABEL: mulhuw_v8i16_lshr_i64: 1550; AVX2: # %bb.0: 1551; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1 1552; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1553; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1554; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1555; AVX2-NEXT: retq 1556; 1557; AVX512-LABEL: mulhuw_v8i16_lshr_i64: 1558; AVX512: # %bb.0: 1559; AVX512-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 1560; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 1561; AVX512-NEXT: retq 1562 %a1 = zext <8 x i16> %a to <8 x i64> 1563 %b1 = zext <8 x i16> %b to <8 x i64> 1564 %c = mul <8 x i64> %a1, %b1 1565 %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> 1566 ret <8 x i64> %d 1567} 1568 1569define <8 x i64> @mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { 
1570; SSE2-LABEL: mulhsw_v8i16_lshr_i64: 1571; SSE2: # %bb.0: 1572; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] 1573; SSE2-NEXT: psrad $16, %xmm6 1574; SSE2-NEXT: pxor %xmm13, %xmm13 1575; SSE2-NEXT: pxor %xmm10, %xmm10 1576; SSE2-NEXT: pcmpgtd %xmm6, %xmm10 1577; SSE2-NEXT: movdqa %xmm6, %xmm8 1578; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] 1579; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm10[2],xmm6[3],xmm10[3] 1580; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 1581; SSE2-NEXT: psrad $16, %xmm4 1582; SSE2-NEXT: pxor %xmm5, %xmm5 1583; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 1584; SSE2-NEXT: movdqa %xmm4, %xmm11 1585; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] 1586; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] 1587; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] 1588; SSE2-NEXT: psrad $16, %xmm7 1589; SSE2-NEXT: pxor %xmm12, %xmm12 1590; SSE2-NEXT: pcmpgtd %xmm7, %xmm12 1591; SSE2-NEXT: movdqa %xmm7, %xmm9 1592; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] 1593; SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] 1594; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1595; SSE2-NEXT: psrad $16, %xmm1 1596; SSE2-NEXT: pcmpgtd %xmm1, %xmm13 1597; SSE2-NEXT: movdqa %xmm1, %xmm0 1598; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] 1599; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] 1600; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,1,3,3] 1601; SSE2-NEXT: pmuludq %xmm4, %xmm3 1602; SSE2-NEXT: pmuludq %xmm1, %xmm4 1603; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,1,3,3] 1604; SSE2-NEXT: pmuludq %xmm1, %xmm2 1605; SSE2-NEXT: paddq %xmm2, %xmm3 1606; SSE2-NEXT: psllq $32, %xmm3 1607; SSE2-NEXT: paddq %xmm4, %xmm3 1608; SSE2-NEXT: 
pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3] 1609; SSE2-NEXT: pmuludq %xmm11, %xmm2 1610; SSE2-NEXT: pmuludq %xmm0, %xmm11 1611; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,1,3] 1612; SSE2-NEXT: pmuludq %xmm0, %xmm1 1613; SSE2-NEXT: paddq %xmm1, %xmm2 1614; SSE2-NEXT: psllq $32, %xmm2 1615; SSE2-NEXT: paddq %xmm11, %xmm2 1616; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3] 1617; SSE2-NEXT: pmuludq %xmm6, %xmm1 1618; SSE2-NEXT: pmuludq %xmm7, %xmm6 1619; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,1,3,3] 1620; SSE2-NEXT: pmuludq %xmm7, %xmm0 1621; SSE2-NEXT: paddq %xmm0, %xmm1 1622; SSE2-NEXT: psllq $32, %xmm1 1623; SSE2-NEXT: paddq %xmm6, %xmm1 1624; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3] 1625; SSE2-NEXT: pmuludq %xmm8, %xmm0 1626; SSE2-NEXT: pmuludq %xmm9, %xmm8 1627; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] 1628; SSE2-NEXT: pmuludq %xmm9, %xmm4 1629; SSE2-NEXT: paddq %xmm4, %xmm0 1630; SSE2-NEXT: psllq $32, %xmm0 1631; SSE2-NEXT: paddq %xmm8, %xmm0 1632; SSE2-NEXT: psrlq $16, %xmm0 1633; SSE2-NEXT: psrlq $16, %xmm1 1634; SSE2-NEXT: psrlq $16, %xmm2 1635; SSE2-NEXT: psrlq $16, %xmm3 1636; SSE2-NEXT: retq 1637; 1638; SSE41-LABEL: mulhsw_v8i16_lshr_i64: 1639; SSE41: # %bb.0: 1640; SSE41-NEXT: pmulhw %xmm1, %xmm0 1641; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 1642; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1643; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1644; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 1645; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1646; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 1647; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 1648; SSE41-NEXT: movdqa %xmm4, %xmm0 1649; SSE41-NEXT: retq 1650; 1651; AVX2-LABEL: mulhsw_v8i16_lshr_i64: 1652; AVX2: # %bb.0: 1653; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 1654; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1655; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1656; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1657; AVX2-NEXT: retq 1658; 1659; AVX512-LABEL: mulhsw_v8i16_lshr_i64: 1660; AVX512: # %bb.0: 1661; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 1662; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 1663; AVX512-NEXT: retq 1664 %a1 = sext <8 x i16> %a to <8 x i64> 1665 %b1 = sext <8 x i16> %b to <8 x i64> 1666 %c = mul <8 x i64> %a1, %b1 1667 %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> 1668 ret <8 x i64> %d 1669} 1670 1671define <8 x i64> @mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) { 1672; SSE2-LABEL: mulhsw_v8i16_ashr_i64: 1673; SSE2: # %bb.0: 1674; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] 1675; SSE2-NEXT: psrad $16, %xmm5 1676; SSE2-NEXT: pxor %xmm13, %xmm13 1677; SSE2-NEXT: pxor %xmm10, %xmm10 1678; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 1679; SSE2-NEXT: movdqa %xmm5, %xmm8 1680; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] 1681; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] 1682; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 1683; SSE2-NEXT: psrad $16, %xmm2 1684; SSE2-NEXT: pxor %xmm3, %xmm3 1685; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 1686; SSE2-NEXT: movdqa %xmm2, %xmm11 1687; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] 1688; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] 1689; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1690; SSE2-NEXT: psrad $16, %xmm0 1691; SSE2-NEXT: pxor %xmm12, %xmm12 1692; SSE2-NEXT: pcmpgtd %xmm0, %xmm12 1693; SSE2-NEXT: movdqa %xmm0, %xmm9 1694; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] 1695; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] 1696; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1697; SSE2-NEXT: psrad $16, %xmm1 1698; SSE2-NEXT: pcmpgtd %xmm1, %xmm13 1699; SSE2-NEXT: movdqa %xmm1, %xmm6 1700; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1] 1701; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] 1702; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,1,3,3] 1703; SSE2-NEXT: pmuludq %xmm2, %xmm4 1704; SSE2-NEXT: pmuludq %xmm1, %xmm2 1705; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,1,3,3] 1706; SSE2-NEXT: pmuludq %xmm1, %xmm7 1707; SSE2-NEXT: paddq %xmm7, %xmm4 1708; SSE2-NEXT: psllq $32, %xmm4 1709; SSE2-NEXT: paddq %xmm2, %xmm4 1710; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm13[0,1,1,3] 1711; SSE2-NEXT: pmuludq %xmm11, %xmm7 1712; SSE2-NEXT: pmuludq %xmm6, %xmm11 1713; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] 1714; SSE2-NEXT: pmuludq %xmm6, %xmm1 1715; SSE2-NEXT: paddq %xmm1, %xmm7 1716; SSE2-NEXT: psllq $32, %xmm7 1717; SSE2-NEXT: paddq %xmm11, %xmm7 1718; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3] 1719; SSE2-NEXT: pmuludq %xmm5, %xmm1 1720; SSE2-NEXT: pmuludq %xmm0, %xmm5 1721; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,1,3,3] 1722; SSE2-NEXT: pmuludq %xmm0, %xmm2 1723; SSE2-NEXT: paddq %xmm2, %xmm1 1724; SSE2-NEXT: psllq $32, %xmm1 1725; SSE2-NEXT: paddq %xmm5, %xmm1 1726; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3] 1727; SSE2-NEXT: pmuludq %xmm8, %xmm0 1728; SSE2-NEXT: pmuludq %xmm9, %xmm8 1729; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,1,3] 1730; SSE2-NEXT: pmuludq %xmm9, %xmm2 1731; SSE2-NEXT: paddq %xmm2, %xmm0 1732; SSE2-NEXT: psllq $32, %xmm0 1733; SSE2-NEXT: 
paddq %xmm8, %xmm0 1734; SSE2-NEXT: movdqa %xmm0, %xmm2 1735; SSE2-NEXT: psrad $16, %xmm2 1736; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] 1737; SSE2-NEXT: psrlq $16, %xmm0 1738; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1739; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1740; SSE2-NEXT: movdqa %xmm1, %xmm2 1741; SSE2-NEXT: psrad $16, %xmm2 1742; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] 1743; SSE2-NEXT: psrlq $16, %xmm1 1744; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1745; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1746; SSE2-NEXT: movdqa %xmm7, %xmm2 1747; SSE2-NEXT: psrad $16, %xmm2 1748; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] 1749; SSE2-NEXT: psrlq $16, %xmm7 1750; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] 1751; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 1752; SSE2-NEXT: movdqa %xmm4, %xmm3 1753; SSE2-NEXT: psrad $16, %xmm3 1754; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] 1755; SSE2-NEXT: psrlq $16, %xmm4 1756; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] 1757; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] 1758; SSE2-NEXT: retq 1759; 1760; SSE41-LABEL: mulhsw_v8i16_ashr_i64: 1761; SSE41: # %bb.0: 1762; SSE41-NEXT: pmulhw %xmm1, %xmm0 1763; SSE41-NEXT: pmovsxwq %xmm0, %xmm4 1764; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1765; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 1766; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 1767; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 1768; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 1769; SSE41-NEXT: pmovsxwq %xmm0, %xmm3 1770; SSE41-NEXT: movdqa %xmm4, %xmm0 1771; SSE41-NEXT: retq 1772; 1773; AVX2-LABEL: mulhsw_v8i16_ashr_i64: 1774; AVX2: # %bb.0: 1775; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 1776; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0 1777; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1778; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 1779; AVX2-NEXT: retq 1780; 1781; AVX512-LABEL: 
mulhsw_v8i16_ashr_i64: 1782; AVX512: # %bb.0: 1783; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 1784; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 1785; AVX512-NEXT: retq 1786 %a1 = sext <8 x i16> %a to <8 x i64> 1787 %b1 = sext <8 x i16> %b to <8 x i64> 1788 %c = mul <8 x i64> %a1, %b1 1789 %d = ashr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> 1790 ret <8 x i64> %d 1791} 1792