; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL32,AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL32,AVX512
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL32,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-64
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL64,AVX2-64
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL64,AVX512F-64
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL64,AVX512F-64,AVX512BW-64

;===-----------------------------------------------------------------------------===
; This test checks the ability to recognize a cross element pattern of
; constants and perform the load via broadcasting a smaller constant
; vector.
; For example:
; <i32 0, i32 1, i32 0, i32 1> => broadcast of the constant vector <i32 0, i32 1>
;===-----------------------------------------------------------------------------===

; <16 x i8> whose bytes repeat every 2 elements. The checks show that with plain
; AVX the full 16-byte constant is loaded (vmovdqa), while AVX2/AVX512 materialize
; it as an i16 splat (vpbroadcastw of 0x0100 = 256).
define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi8_i16:
; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi8_i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi8_i16:
; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
  %res1 = add <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
  %res2 = and <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
  ret <16 x i8> %res2
}


; <16 x i8> whose bytes repeat every 4 elements: splat of the i32 0x03020100
; (= 50462976). AVX uses vbroadcastss, AVX2/AVX512 use vpbroadcastd.
define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i32:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi8_i32:
; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi8_i32:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi8_i32:
; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
  %res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
  %res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
  ret <16 x i8> %res2
}


; <16 x i8> whose bytes repeat every 8 elements: splat of the i64
; 0x0706050403020100 (= 506097522914230528) via vmovddup (AVX) or vpbroadcastq.
define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i64:
; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi8_i64:
; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi8_i64:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
; AVX-64-NEXT: # xmm1 = mem[0,0]
; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi8_i64:
; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
  %res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
  %res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
  ret <16 x i8> %res2
}


; 256-bit version of the i16-repeat pattern. AVX1 has to split the ymm add into
; two xmm halves (vpaddb is integer-only on AVX1); AVX2/AVX512 keep it in ymm
; with a vpbroadcastw splat.
define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i16:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f32xi8_i16:
; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f32xi8_i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f32xi8_i16:
; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
  %res1 = add <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
  %res2 = and <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
  ret <32 x i8> %res2
}


; <32 x i8> repeating every 4 bytes: vbroadcastss splat on AVX1 (halves added
; separately), single vpbroadcastd on AVX2/AVX512.
define <32 x i8> @f32xi8_i32(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [50462976,50462976,50462976,50462976]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f32xi8_i32:
; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f32xi8_i32:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [50462976,50462976,50462976,50462976]
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f32xi8_i32:
; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
  %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
  %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
  ret <32 x i8> %res2
}


; <32 x i8> repeating every 8 bytes: vmovddup splat on AVX1, vpbroadcastq on
; AVX2/AVX512.
define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i64:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [506097522914230528,506097522914230528]
; AVX-NEXT: # xmm2 = mem[0,0]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f32xi8_i64:
; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f32xi8_i64:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [506097522914230528,506097522914230528]
; AVX-64-NEXT: # xmm2 = mem[0,0]
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f32xi8_i64:
; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
  %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
  %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
  ret <32 x i8> %res2
}


; <32 x i8> whose two 128-bit lanes are identical: AVX2/AVX512 use
; vbroadcasti128 of the 16-byte lane.
define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i128:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f32xi8_i128:
; ALL32: # %bb.0:
; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f32xi8_i128:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f32xi8_i128:
; ALL64: # %bb.0:
; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
  %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
  %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
  ret <32 x i8> %res2
}


; 512-bit version of the i16-repeat pattern. The <64 x i8> argument arrives as
; two ymm registers on AVX/AVX2 and a single zmm on AVX512BW (vpaddb %zmm
; requires BW, hence the AVX512BW prefixes).
define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i16:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
; AVX2-LABEL: f64xi8_i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f64xi8_i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f64xi8_i16:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64xi8_i16:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
  %res1 = add <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
  %res2 = and <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
  ret <64 x i8> %res2
}


; NOTE(review): this name is missing the 'x' (f64i8_i32 vs the f64xi8_* pattern
; used by every sibling test). Renaming would require regenerating all six
; LABEL check lines with update_llc_test_checks.py, so it is flagged, not changed.
; <64 x i8> repeating every 4 bytes.
define <64 x i8> @f64i8_i32(<64 x i8> %a) {
; AVX-LABEL: f64i8_i32:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
; AVX2-LABEL: f64i8_i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f64i8_i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f64i8_i32:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f64i8_i32:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64i8_i32:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
  %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
  %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
  ret <64 x i8> %res2
}


; <64 x i8> repeating every 8 bytes: vbroadcastsd (AVX1) / vpbroadcastq splat.
define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i64:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
; AVX2-LABEL: f64xi8_i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f64xi8_i64:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f64xi8_i64:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64xi8_i64:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
  %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
  %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
  ret <64 x i8> %res2
}


; <64 x i8> whose 128-bit lanes are all identical: vbroadcasti128 on AVX2,
; vbroadcasti32x4 to zmm on AVX512BW. On AVX1 the lane constant is widened with
; vinsertf128 of xmm3 into ymm3 rather than reloaded.
define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i128:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
; AVX2-LABEL: f64xi8_i128:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i128:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f64xi8_i128:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2
; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f64xi8_i128:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64xi8_i128:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
  %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
  %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
  ret <64 x i8> %res2
}


; <64 x i8> whose two 256-bit halves are identical: vbroadcasti64x4 on AVX512BW;
; AVX2 just loads the full 32-byte constant (vmovdqa).
define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i256:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT: vpaddb %xmm4, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm4, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
; AVX2-LABEL: f64xi8_i256:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i256:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f64xi8_i256:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-64-NEXT: vpaddb %xmm4, %xmm1, %xmm1
; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT: vpaddb %xmm4, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f64xi8_i256:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64xi8_i256:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
  %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %a
  %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %res1
  ret <64 x i8> %res2
}


; <8 x i16> repeating every 2 elements: splat of the i32 0x00010000 (= 65536).
define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i32:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [65536,65536,65536,65536]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f8xi16_i32:
; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xi16_i32:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [65536,65536,65536,65536]
; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f8xi16_i32:
; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
  %res1 = add <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
  %res2 = and <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
  ret <8 x i16> %res2
}


; <8 x i16> repeating every 4 elements: splat of the i64 0x0003000200010000
; (= 844433520132096).
define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i64:
; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [844433520132096,844433520132096]
; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f8xi16_i64:
; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096]
; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xi16_i64:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [844433520132096,844433520132096]
; AVX-64-NEXT: # xmm1 = mem[0,0]
; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f8xi16_i64:
; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
  %res1 = add <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
  %res2 = and <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
  ret <8 x i16> %res2
}


; 256-bit version of the i16-pair pattern: i32 splat of 65536.
define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
; AVX-LABEL: f16xi16_i32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [65536,65536,65536,65536]
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi16_i32:
; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi16_i32:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [65536,65536,65536,65536]
; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi16_i32:
; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
  %res1 = add <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
  %res2 = and <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
  ret <16 x i16> %res2
}


; NOTE(review): this function continues beyond the end of this chunk; only the
; visible lines are reproduced here, unchanged.
define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
; AVX-LABEL: f16xi16_i64:
; AVX: # %bb.0:
754; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 755; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [844433520132096,844433520132096] 756; AVX-NEXT: # xmm2 = mem[0,0] 757; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 758; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 759; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 760; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 761; AVX-NEXT: retl 762; 763; ALL32-LABEL: f16xi16_i64: 764; ALL32: # %bb.0: 765; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096] 766; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0 767; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 768; ALL32-NEXT: retl 769; 770; AVX-64-LABEL: f16xi16_i64: 771; AVX-64: # %bb.0: 772; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 773; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [844433520132096,844433520132096] 774; AVX-64-NEXT: # xmm2 = mem[0,0] 775; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 776; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 777; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 778; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 779; AVX-64-NEXT: retq 780; 781; ALL64-LABEL: f16xi16_i64: 782; ALL64: # %bb.0: 783; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096] 784; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 785; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 786; ALL64-NEXT: retq 787 %res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a 788 %res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1 789 ret <16 x i16> %res2 790} 791 792 793define <16 x i16> @f16xi16_i128(<16 x i16> %a) { 794; AVX-LABEL: f16xi16_i128: 795; AVX: # %bb.0: 796; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 797; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] 798; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 799; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 800; AVX-NEXT: vinsertf128 $1, 
%xmm1, %ymm0, %ymm0 801; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 802; AVX-NEXT: retl 803; 804; ALL32-LABEL: f16xi16_i128: 805; ALL32: # %bb.0: 806; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] 807; ALL32-NEXT: # ymm1 = mem[0,1,0,1] 808; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0 809; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 810; ALL32-NEXT: retl 811; 812; AVX-64-LABEL: f16xi16_i128: 813; AVX-64: # %bb.0: 814; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 815; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] 816; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 817; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 818; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 819; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 820; AVX-64-NEXT: retq 821; 822; ALL64-LABEL: f16xi16_i128: 823; ALL64: # %bb.0: 824; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] 825; ALL64-NEXT: # ymm1 = mem[0,1,0,1] 826; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 827; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 828; ALL64-NEXT: retq 829 %res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a 830 %res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1 831 ret <16 x i16> %res2 832} 833 834 835define <32 x i16> @f32xi16_i32(<32 x i16> %a) { 836; AVX-LABEL: f32xi16_i32: 837; AVX: # %bb.0: 838; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] 839; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 840; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 841; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 842; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 843; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 844; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 845; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 846; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 847; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 848; AVX-NEXT: vandps 
%ymm2, %ymm1, %ymm1 849; AVX-NEXT: retl 850; 851; AVX2-LABEL: f32xi16_i32: 852; AVX2: # %bb.0: 853; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] 854; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 855; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 856; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 857; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 858; AVX2-NEXT: retl 859; 860; AVX512BW-LABEL: f32xi16_i32: 861; AVX512BW: # %bb.0: 862; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536] 863; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 864; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 865; AVX512BW-NEXT: retl 866; 867; AVX-64-LABEL: f32xi16_i32: 868; AVX-64: # %bb.0: 869; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] 870; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 871; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 872; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 873; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 874; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 875; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 876; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 877; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 878; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 879; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 880; AVX-64-NEXT: retq 881; 882; AVX2-64-LABEL: f32xi16_i32: 883; AVX2-64: # %bb.0: 884; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] 885; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 886; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 887; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 888; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 889; AVX2-64-NEXT: retq 890; 891; AVX512BW-64-LABEL: f32xi16_i32: 892; AVX512BW-64: # %bb.0: 893; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536] 894; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0 
895; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 896; AVX512BW-64-NEXT: retq 897 %res1 = add <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a 898 %res2 = and <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1 899 ret <32 x i16> %res2 900} 901 902 903define <32 x i16> @f32xi16_i64(<32 x i16> %a) { 904; AVX-LABEL: f32xi16_i64: 905; AVX: # %bb.0: 906; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096] 907; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 908; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 909; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 910; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 911; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 912; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 913; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 914; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 915; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 916; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 917; AVX-NEXT: retl 918; 919; AVX2-LABEL: f32xi16_i64: 920; AVX2: # %bb.0: 921; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096] 922; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 923; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 924; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 925; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 926; AVX2-NEXT: retl 927; 928; AVX512BW-LABEL: f32xi16_i64: 929; AVX512BW: # %bb.0: 930; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096] 931; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 932; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 
933; AVX512BW-NEXT: retl 934; 935; AVX-64-LABEL: f32xi16_i64: 936; AVX-64: # %bb.0: 937; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096] 938; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 939; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 940; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 941; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 942; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 943; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 944; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 945; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 946; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 947; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 948; AVX-64-NEXT: retq 949; 950; AVX2-64-LABEL: f32xi16_i64: 951; AVX2-64: # %bb.0: 952; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096] 953; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 954; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 955; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 956; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 957; AVX2-64-NEXT: retq 958; 959; AVX512BW-64-LABEL: f32xi16_i64: 960; AVX512BW-64: # %bb.0: 961; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096] 962; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0 963; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 964; AVX512BW-64-NEXT: retq 965 %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a 966 %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1 967 ret <32 x i16> %res2 968} 
969 970 971define <32 x i16> @f32xi16_i128(<32 x i16> %a) { 972; AVX-LABEL: f32xi16_i128: 973; AVX: # %bb.0: 974; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 975; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] 976; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 977; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 978; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 979; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 980; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 981; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 982; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 983; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 984; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 985; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 986; AVX-NEXT: retl 987; 988; AVX2-LABEL: f32xi16_i128: 989; AVX2: # %bb.0: 990; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] 991; AVX2-NEXT: # ymm2 = mem[0,1,0,1] 992; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 993; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 994; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 995; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 996; AVX2-NEXT: retl 997; 998; AVX512BW-LABEL: f32xi16_i128: 999; AVX512BW: # %bb.0: 1000; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] 1001; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1002; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 1003; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 1004; AVX512BW-NEXT: retl 1005; 1006; AVX-64-LABEL: f32xi16_i128: 1007; AVX-64: # %bb.0: 1008; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 1009; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] 1010; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 1011; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 1012; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1013; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 1014; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 1015; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 1016; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1017; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 1018; 
AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 1019; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 1020; AVX-64-NEXT: retq 1021; 1022; AVX2-64-LABEL: f32xi16_i128: 1023; AVX2-64: # %bb.0: 1024; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] 1025; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] 1026; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 1027; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 1028; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 1029; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 1030; AVX2-64-NEXT: retq 1031; 1032; AVX512BW-64-LABEL: f32xi16_i128: 1033; AVX512BW-64: # %bb.0: 1034; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] 1035; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1036; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0 1037; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 1038; AVX512BW-64-NEXT: retq 1039 %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a 1040 %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1 1041 ret <32 x i16> %res2 1042} 1043 1044 1045define <32 x i16> @f32xi16_i256(<32 x i16> %a) { 1046; AVX-LABEL: f32xi16_i256: 1047; AVX: # %bb.0: 1048; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 1049; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] 1050; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 1051; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7] 1052; AVX-NEXT: vpaddw %xmm4, %xmm1, %xmm1 1053; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1054; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 1055; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 1056; AVX-NEXT: vpaddw %xmm4, 
%xmm0, %xmm0 1057; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1058; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1059; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 1060; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 1061; AVX-NEXT: retl 1062; 1063; AVX2-LABEL: f32xi16_i256: 1064; AVX2: # %bb.0: 1065; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1066; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 1067; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 1068; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1069; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1070; AVX2-NEXT: retl 1071; 1072; AVX512BW-LABEL: f32xi16_i256: 1073; AVX512BW: # %bb.0: 1074; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1075; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 1076; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 1077; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 1078; AVX512BW-NEXT: retl 1079; 1080; AVX-64-LABEL: f32xi16_i256: 1081; AVX-64: # %bb.0: 1082; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 1083; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] 1084; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 1085; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7] 1086; AVX-64-NEXT: vpaddw %xmm4, %xmm1, %xmm1 1087; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1088; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 1089; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 1090; AVX-64-NEXT: vpaddw %xmm4, %xmm0, %xmm0 1091; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1092; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1093; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 1094; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 1095; AVX-64-NEXT: retq 1096; 1097; AVX2-64-LABEL: f32xi16_i256: 1098; AVX2-64: # %bb.0: 1099; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1100; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 1101; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 1102; 
AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 1103; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 1104; AVX2-64-NEXT: retq 1105; 1106; AVX512BW-64-LABEL: f32xi16_i256: 1107; AVX512BW-64: # %bb.0: 1108; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1109; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 1110; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0 1111; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 1112; AVX512BW-64-NEXT: retq 1113 %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a 1114 %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %res1 1115 ret <32 x i16> %res2 1116} 1117 1118 1119define <4 x i32> @f4xi32_i64(<4 x i32> %a) { 1120; AVX-LABEL: f4xi32_i64: 1121; AVX: # %bb.0: 1122; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967296,4294967296] 1123; AVX-NEXT: # xmm1 = mem[0,0] 1124; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1125; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1126; AVX-NEXT: retl 1127; 1128; ALL32-LABEL: f4xi32_i64: 1129; ALL32: # %bb.0: 1130; ALL32-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296] 1131; ALL32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1132; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 1133; ALL32-NEXT: retl 1134; 1135; AVX-64-LABEL: f4xi32_i64: 1136; AVX-64: # %bb.0: 1137; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [4294967296,4294967296] 1138; AVX-64-NEXT: # xmm1 = mem[0,0] 1139; AVX-64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1140; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 1141; AVX-64-NEXT: retq 1142; 1143; ALL64-LABEL: f4xi32_i64: 1144; ALL64: # %bb.0: 1145; ALL64-NEXT: 
vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296] 1146; ALL64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1147; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 1148; ALL64-NEXT: retq 1149 %res1 = add <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %a 1150 %res2 = and <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %res1 1151 ret <4 x i32> %res2 1152} 1153 1154 1155define <8 x i32> @f8xi32_i64(<8 x i32> %a) { 1156; AVX-LABEL: f8xi32_i64: 1157; AVX: # %bb.0: 1158; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1159; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [4294967296,4294967296] 1160; AVX-NEXT: # xmm2 = mem[0,0] 1161; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1162; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 1163; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1164; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 1165; AVX-NEXT: retl 1166; 1167; ALL32-LABEL: f8xi32_i64: 1168; ALL32: # %bb.0: 1169; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296] 1170; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1171; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 1172; ALL32-NEXT: retl 1173; 1174; AVX-64-LABEL: f8xi32_i64: 1175; AVX-64: # %bb.0: 1176; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 1177; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [4294967296,4294967296] 1178; AVX-64-NEXT: # xmm2 = mem[0,0] 1179; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1180; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 1181; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1182; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 1183; AVX-64-NEXT: retq 1184; 1185; ALL64-LABEL: f8xi32_i64: 1186; ALL64: # %bb.0: 1187; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296] 1188; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1189; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 1190; ALL64-NEXT: retq 1191 %res1 = add <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a 1192 %res2 = and <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1 1193 ret <8 x i32> %res2 1194} 1195 1196 1197define <8 x i32> 
@f8xi32_i128(<8 x i32> %a) { 1198; AVX-LABEL: f8xi32_i128: 1199; AVX: # %bb.0: 1200; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1201; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3] 1202; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1203; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 1204; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1205; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 1206; AVX-NEXT: retl 1207; 1208; ALL32-LABEL: f8xi32_i128: 1209; ALL32: # %bb.0: 1210; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] 1211; ALL32-NEXT: # ymm1 = mem[0,1,0,1] 1212; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1213; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 1214; ALL32-NEXT: retl 1215; 1216; AVX-64-LABEL: f8xi32_i128: 1217; AVX-64: # %bb.0: 1218; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 1219; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3] 1220; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1221; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 1222; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1223; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 1224; AVX-64-NEXT: retq 1225; 1226; ALL64-LABEL: f8xi32_i128: 1227; ALL64: # %bb.0: 1228; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] 1229; ALL64-NEXT: # ymm1 = mem[0,1,0,1] 1230; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1231; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 1232; ALL64-NEXT: retq 1233 %res1 = add <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a 1234 %res2 = and <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1 1235 ret <8 x i32> %res2 1236} 1237 1238 1239define <16 x i32> @f16xi32_i64(<16 x i32> %a) { 1240; AVX-LABEL: f16xi32_i64: 1241; AVX: # %bb.0: 1242; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296] 1243; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 1244; AVX-NEXT: vpaddd %xmm2, %xmm3, %xmm3 1245; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1246; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1247; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 1248; AVX-NEXT: vpaddd 
%xmm2, %xmm3, %xmm3 1249; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 1250; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 1251; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 1252; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 1253; AVX-NEXT: retl 1254; 1255; AVX2-LABEL: f16xi32_i64: 1256; AVX2: # %bb.0: 1257; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296] 1258; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 1259; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 1260; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1261; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1262; AVX2-NEXT: retl 1263; 1264; AVX512-LABEL: f16xi32_i64: 1265; AVX512: # %bb.0: 1266; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296] 1267; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1268; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 1269; AVX512-NEXT: retl 1270; 1271; AVX-64-LABEL: f16xi32_i64: 1272; AVX-64: # %bb.0: 1273; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296] 1274; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 1275; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3 1276; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1277; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1278; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 1279; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3 1280; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 1281; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 1282; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 1283; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 1284; AVX-64-NEXT: retq 1285; 1286; AVX2-64-LABEL: f16xi32_i64: 1287; AVX2-64: # %bb.0: 1288; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296] 1289; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 1290; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 1291; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 1292; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 1293; AVX2-64-NEXT: retq 1294; 1295; AVX512F-64-LABEL: f16xi32_i64: 1296; AVX512F-64: # %bb.0: 1297; 
AVX512F-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296] 1298; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1299; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0 1300; AVX512F-64-NEXT: retq 1301 %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a 1302 %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1 1303 ret <16 x i32> %res2 1304} 1305 1306 1307define <16 x i32> @f16xi32_i128(<16 x i32> %a) { 1308; AVX-LABEL: f16xi32_i128: 1309; AVX: # %bb.0: 1310; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 1311; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] 1312; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1313; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1 1314; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1315; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 1316; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1317; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0 1318; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1319; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 1320; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 1321; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 1322; AVX-NEXT: retl 1323; 1324; AVX2-LABEL: f16xi32_i128: 1325; AVX2: # %bb.0: 1326; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] 1327; AVX2-NEXT: # ymm2 = mem[0,1,0,1] 1328; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 1329; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 1330; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1331; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1332; AVX2-NEXT: retl 1333; 1334; AVX512-LABEL: f16xi32_i128: 1335; AVX512: # %bb.0: 1336; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1337; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1338; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1339; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 1340; AVX512-NEXT: retl 1341; 1342; AVX-64-LABEL: 
f16xi32_i128: 1343; AVX-64: # %bb.0: 1344; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 1345; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] 1346; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1347; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1 1348; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1349; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 1350; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1351; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 1352; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1353; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 1354; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 1355; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 1356; AVX-64-NEXT: retq 1357; 1358; AVX2-64-LABEL: f16xi32_i128: 1359; AVX2-64: # %bb.0: 1360; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] 1361; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] 1362; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 1363; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 1364; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 1365; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 1366; AVX2-64-NEXT: retq 1367; 1368; AVX512F-64-LABEL: f16xi32_i128: 1369; AVX512F-64: # %bb.0: 1370; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1371; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1372; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1373; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0 1374; AVX512F-64-NEXT: retq 1375 %res1 = add <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a 1376 %res2 = and <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1 1377 ret <16 x i32> %res2 1378} 1379 1380 1381define <4 x i64> @f4xi64_i128(<4 x i64> %a) { 1382; AVX-LABEL: f4xi64_i128: 1383; AVX: # %bb.0: 1384; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1385; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0] 1386; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 1387; AVX-NEXT: vpaddq 
%xmm2, %xmm0, %xmm0 1388; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1389; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 1390; AVX-NEXT: retl 1391; 1392; ALL32-LABEL: f4xi64_i128: 1393; ALL32: # %bb.0: 1394; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] 1395; ALL32-NEXT: # ymm1 = mem[0,1,0,1] 1396; ALL32-NEXT: vpaddq %ymm1, %ymm0, %ymm0 1397; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 1398; ALL32-NEXT: retl 1399; 1400; AVX-64-LABEL: f4xi64_i128: 1401; AVX-64: # %bb.0: 1402; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 1403; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] 1404; AVX-64-NEXT: vpaddq %xmm2, %xmm1, %xmm1 1405; AVX-64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 1406; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1407; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 1408; AVX-64-NEXT: retq 1409; 1410; ALL64-LABEL: f4xi64_i128: 1411; ALL64: # %bb.0: 1412; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,0,1] 1413; ALL64-NEXT: # ymm1 = mem[0,1,0,1] 1414; ALL64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 1415; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 1416; ALL64-NEXT: retq 1417 %res1 = add <4 x i64> <i64 0, i64 1, i64 0, i64 1>, %a 1418 %res2 = and <4 x i64> <i64 0, i64 1, i64 0, i64 1>, %res1 1419 ret <4 x i64> %res2 1420} 1421 1422 1423define <8 x i64> @f8xi64_i128(<8 x i64> %a) { 1424; AVX-LABEL: f8xi64_i128: 1425; AVX: # %bb.0: 1426; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 1427; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,0] 1428; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 1429; AVX-NEXT: vpaddq %xmm3, %xmm1, %xmm1 1430; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1431; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 1432; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 1433; AVX-NEXT: vpaddq %xmm3, %xmm0, %xmm0 1434; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1435; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 1436; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 1437; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 1438; AVX-NEXT: retl 1439; 1440; AVX2-LABEL: f8xi64_i128: 1441; AVX2: # 
%bb.0: 1442; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] 1443; AVX2-NEXT: # ymm2 = mem[0,1,0,1] 1444; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 1445; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 1446; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1447; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1448; AVX2-NEXT: retl 1449; 1450; AVX512-LABEL: f8xi64_i128: 1451; AVX512: # %bb.0: 1452; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0] 1453; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1454; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 1455; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 1456; AVX512-NEXT: retl 1457; 1458; AVX-64-LABEL: f8xi64_i128: 1459; AVX-64: # %bb.0: 1460; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 1461; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] 1462; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 1463; AVX-64-NEXT: vpaddq %xmm3, %xmm1, %xmm1 1464; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1465; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 1466; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 1467; AVX-64-NEXT: vpaddq %xmm3, %xmm0, %xmm0 1468; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1469; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] 1470; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] 1471; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 1472; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 1473; AVX-64-NEXT: retq 1474; 1475; AVX2-64-LABEL: f8xi64_i128: 1476; AVX2-64: # %bb.0: 1477; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,0,1] 1478; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] 1479; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1 1480; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 1481; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 1482; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 1483; AVX2-64-NEXT: retq 1484; 1485; AVX512F-64-LABEL: f8xi64_i128: 1486; AVX512F-64: # %bb.0: 1487; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1] 1488; AVX512F-64-NEXT: # zmm1 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1489; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 1490; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 1491; AVX512F-64-NEXT: retq 1492 %res1 = add <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %a 1493 %res2 = and <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %res1 1494 ret <8 x i64> %res2 1495} 1496 1497 1498define <8 x i64> @f8xi64_i256(<8 x i64> %a) { 1499; AVX-LABEL: f8xi64_i256: 1500; AVX: # %bb.0: 1501; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 1502; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,0,3,0] 1503; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 1504; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,0,1,0] 1505; AVX-NEXT: vpaddq %xmm4, %xmm1, %xmm1 1506; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1507; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 1508; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 1509; AVX-NEXT: vpaddq %xmm4, %xmm0, %xmm0 1510; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1511; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0] 1512; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 1513; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 1514; AVX-NEXT: retl 1515; 1516; AVX2-LABEL: f8xi64_i256: 1517; AVX2: # %bb.0: 1518; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0] 1519; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 1520; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 1521; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1522; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1523; AVX2-NEXT: retl 1524; 1525; AVX512-LABEL: f8xi64_i256: 1526; AVX512: # %bb.0: 1527; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0] 1528; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 1529; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 1530; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 1531; AVX512-NEXT: retl 1532; 1533; AVX-64-LABEL: f8xi64_i256: 1534; AVX-64: # %bb.0: 1535; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 1536; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3] 1537; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 1538; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = 
[0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] 1539; AVX-64-NEXT: vpaddq %xmm4, %xmm1, %xmm1 1540; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1541; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 1542; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 1543; AVX-64-NEXT: vpaddq %xmm4, %xmm0, %xmm0 1544; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1545; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3] 1546; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 1547; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 1548; AVX-64-NEXT: retq 1549; 1550; AVX2-64-LABEL: f8xi64_i256: 1551; AVX2-64: # %bb.0: 1552; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3] 1553; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1 1554; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 1555; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 1556; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 1557; AVX2-64-NEXT: retq 1558; 1559; AVX512F-64-LABEL: f8xi64_i256: 1560; AVX512F-64: # %bb.0: 1561; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3] 1562; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 1563; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 1564; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 1565; AVX512F-64-NEXT: retq 1566 %res1 = add <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %a 1567 %res2 = and <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %res1 1568 ret <8 x i64> %res2 1569} 1570 1571 1572define <4 x float> @f4xf32_f64(<4 x float> %a) { 1573; AVX-LABEL: f4xf32_f64: 1574; AVX: # %bb.0: 1575; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760] 1576; AVX-NEXT: # xmm1 = mem[0,0] 1577; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 1578; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0 1579; AVX-NEXT: retl 1580; 1581; ALL32-LABEL: f4xf32_f64: 1582; ALL32: # %bb.0: 1583; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760] 1584; ALL32-NEXT: # xmm1 = mem[0,0] 1585; ALL32-NEXT: vaddps %xmm1, %xmm0, %xmm0 1586; ALL32-NEXT: vdivps %xmm0, %xmm1, %xmm0 1587; ALL32-NEXT: retl 1588; 
1589; AVX-64-LABEL: f4xf32_f64: 1590; AVX-64: # %bb.0: 1591; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760] 1592; AVX-64-NEXT: # xmm1 = mem[0,0] 1593; AVX-64-NEXT: vaddps %xmm1, %xmm0, %xmm0 1594; AVX-64-NEXT: vdivps %xmm0, %xmm1, %xmm0 1595; AVX-64-NEXT: retq 1596; 1597; ALL64-LABEL: f4xf32_f64: 1598; ALL64: # %bb.0: 1599; ALL64-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760] 1600; ALL64-NEXT: # xmm1 = mem[0,0] 1601; ALL64-NEXT: vaddps %xmm1, %xmm0, %xmm0 1602; ALL64-NEXT: vdivps %xmm0, %xmm1, %xmm0 1603; ALL64-NEXT: retq 1604 %res1 = fadd <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %a 1605 %res2 = fdiv <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %res1 1606 ret <4 x float> %res2 1607} 1608 1609 1610define <8 x float> @f8xf32_f64(<8 x float> %a) { 1611; AVX-LABEL: f8xf32_f64: 1612; AVX: # %bb.0: 1613; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] 1614; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 1615; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0 1616; AVX-NEXT: retl 1617; 1618; ALL32-LABEL: f8xf32_f64: 1619; ALL32: # %bb.0: 1620; ALL32-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] 1621; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0 1622; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0 1623; ALL32-NEXT: retl 1624; 1625; AVX-64-LABEL: f8xf32_f64: 1626; AVX-64: # %bb.0: 1627; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] 1628; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0 1629; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0 1630; AVX-64-NEXT: retq 1631; 1632; ALL64-LABEL: f8xf32_f64: 1633; ALL64: # %bb.0: 1634; ALL64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] 1635; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0 1636; ALL64-NEXT: vdivps %ymm0, 
%ymm1, %ymm0 1637; ALL64-NEXT: retq 1638 %res1 = fadd <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a 1639 %res2 = fdiv <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1 1640 ret <8 x float> %res2 1641} 1642 1643 1644define <8 x float> @f8xf32_f128(<8 x float> %a) { 1645; AVX-LABEL: f8xf32_f128: 1646; AVX: # %bb.0: 1647; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1648; AVX-NEXT: # ymm1 = mem[0,1,0,1] 1649; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 1650; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0 1651; AVX-NEXT: retl 1652; 1653; ALL32-LABEL: f8xf32_f128: 1654; ALL32: # %bb.0: 1655; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1656; ALL32-NEXT: # ymm1 = mem[0,1,0,1] 1657; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0 1658; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0 1659; ALL32-NEXT: retl 1660; 1661; AVX-64-LABEL: f8xf32_f128: 1662; AVX-64: # %bb.0: 1663; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1664; AVX-64-NEXT: # ymm1 = mem[0,1,0,1] 1665; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0 1666; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0 1667; AVX-64-NEXT: retq 1668; 1669; ALL64-LABEL: f8xf32_f128: 1670; ALL64: # %bb.0: 1671; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1672; ALL64-NEXT: # ymm1 = mem[0,1,0,1] 1673; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0 1674; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0 1675; ALL64-NEXT: retq 1676 %res1 = fadd <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a 1677 %res2 = fdiv <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1 1678 ret <8 x float> %res2 1679} 1680 1681 1682define <16 x float> @f16xf32_f64(<16 x float> %a) { 
1683; AVX-LABEL: f16xf32_f64: 1684; AVX: # %bb.0: 1685; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] 1686; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 1687; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 1688; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 1689; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 1690; AVX-NEXT: retl 1691; 1692; AVX2-LABEL: f16xf32_f64: 1693; AVX2: # %bb.0: 1694; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] 1695; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 1696; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 1697; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 1698; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1 1699; AVX2-NEXT: retl 1700; 1701; AVX512-LABEL: f16xf32_f64: 1702; AVX512: # %bb.0: 1703; AVX512-NEXT: vbroadcastsd {{.*#+}} zmm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] 1704; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 1705; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0 1706; AVX512-NEXT: retl 1707; 1708; AVX-64-LABEL: f16xf32_f64: 1709; AVX-64: # %bb.0: 1710; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] 1711; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 1712; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 1713; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 1714; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 1715; AVX-64-NEXT: retq 1716; 1717; AVX2-64-LABEL: f16xf32_f64: 1718; AVX2-64: # %bb.0: 1719; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] 1720; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 1721; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 1722; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 1723; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 1724; AVX2-64-NEXT: retq 1725; 1726; AVX512F-64-LABEL: f16xf32_f64: 1727; 
AVX512F-64: # %bb.0: 1728; AVX512F-64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] 1729; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0 1730; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0 1731; AVX512F-64-NEXT: retq 1732 %res1 = fadd <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a 1733 %res2 = fdiv <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1 1734 ret <16 x float> %res2 1735} 1736 1737 1738define <16 x float> @f16xf32_f128(<16 x float> %a) { 1739; AVX-LABEL: f16xf32_f128: 1740; AVX: # %bb.0: 1741; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1742; AVX-NEXT: # ymm2 = mem[0,1,0,1] 1743; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 1744; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 1745; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 1746; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 1747; AVX-NEXT: retl 1748; 1749; AVX2-LABEL: f16xf32_f128: 1750; AVX2: # %bb.0: 1751; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1752; AVX2-NEXT: # ymm2 = mem[0,1,0,1] 1753; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 1754; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 1755; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 1756; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1 1757; AVX2-NEXT: retl 1758; 1759; AVX512-LABEL: f16xf32_f128: 1760; AVX512: # %bb.0: 1761; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1762; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1763; AVX512-NEXT: vaddps 
%zmm1, %zmm0, %zmm0 1764; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0 1765; AVX512-NEXT: retl 1766; 1767; AVX-64-LABEL: f16xf32_f128: 1768; AVX-64: # %bb.0: 1769; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1770; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] 1771; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 1772; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 1773; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 1774; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 1775; AVX-64-NEXT: retq 1776; 1777; AVX2-64-LABEL: f16xf32_f128: 1778; AVX2-64: # %bb.0: 1779; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1780; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] 1781; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 1782; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 1783; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 1784; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 1785; AVX2-64-NEXT: retq 1786; 1787; AVX512F-64-LABEL: f16xf32_f128: 1788; AVX512F-64: # %bb.0: 1789; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1790; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1791; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0 1792; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0 1793; AVX512F-64-NEXT: retq 1794 %res1 = fadd <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a 1795 %res2 = fdiv <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1 1796 ret <16 x float> %res2 1797} 1798 1799 1800define <16 x float> @f16xf32_f256(<16 x float> %a) { 1801; AVX-LABEL: f16xf32_f256: 1802; AVX: # %bb.0: 1803; AVX-NEXT: vmovaps {{.*#+}} ymm2 = 
[8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] 1804; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 1805; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 1806; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 1807; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 1808; AVX-NEXT: retl 1809; 1810; AVX2-LABEL: f16xf32_f256: 1811; AVX2: # %bb.0: 1812; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] 1813; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 1814; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 1815; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 1816; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1 1817; AVX2-NEXT: retl 1818; 1819; AVX512-LABEL: f16xf32_f256: 1820; AVX512: # %bb.0: 1821; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] 1822; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 1823; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 1824; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0 1825; AVX512-NEXT: retl 1826; 1827; AVX-64-LABEL: f16xf32_f256: 1828; AVX-64: # %bb.0: 1829; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] 1830; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 1831; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 1832; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 1833; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 1834; AVX-64-NEXT: retq 1835; 1836; AVX2-64-LABEL: f16xf32_f256: 1837; AVX2-64: # %bb.0: 1838; AVX2-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] 1839; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 1840; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 1841; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 1842; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 1843; AVX2-64-NEXT: retq 1844; 1845; AVX512F-64-LABEL: f16xf32_f256: 1846; AVX512F-64: # %bb.0: 1847; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] 1848; 
AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 1849; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0 1850; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0 1851; AVX512F-64-NEXT: retq 1852 %res1 = fadd <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %a 1853 %res2 = fdiv <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %res1 1854 ret <16 x float> %res2 1855} 1856 1857 1858define <4 x double> @f4xf64_f128(<4 x double> %a) { 1859; AVX-LABEL: f4xf64_f128: 1860; AVX: # %bb.0: 1861; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] 1862; AVX-NEXT: # ymm1 = mem[0,1,0,1] 1863; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 1864; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0 1865; AVX-NEXT: retl 1866; 1867; ALL32-LABEL: f4xf64_f128: 1868; ALL32: # %bb.0: 1869; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] 1870; ALL32-NEXT: # ymm1 = mem[0,1,0,1] 1871; ALL32-NEXT: vaddpd %ymm1, %ymm0, %ymm0 1872; ALL32-NEXT: vdivpd %ymm0, %ymm1, %ymm0 1873; ALL32-NEXT: retl 1874; 1875; AVX-64-LABEL: f4xf64_f128: 1876; AVX-64: # %bb.0: 1877; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] 1878; AVX-64-NEXT: # ymm1 = mem[0,1,0,1] 1879; AVX-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 1880; AVX-64-NEXT: vdivpd %ymm0, %ymm1, %ymm0 1881; AVX-64-NEXT: retq 1882; 1883; ALL64-LABEL: f4xf64_f128: 1884; ALL64: # %bb.0: 1885; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] 1886; ALL64-NEXT: # ymm1 = mem[0,1,0,1] 1887; ALL64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 1888; ALL64-NEXT: vdivpd %ymm0, %ymm1, %ymm0 1889; ALL64-NEXT: retq 1890 %res1 = fadd <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %a 1891 %res2 = fdiv <4 x double> <double 2.0, double 1.0, 
double 2.0, double 1.0>, %res1 1892 ret <4 x double> %res2 1893} 1894 1895 1896define <8 x double> @f8xf64_f128(<8 x double> %a) { 1897; AVX-LABEL: f8xf64_f128: 1898; AVX: # %bb.0: 1899; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] 1900; AVX-NEXT: # ymm2 = mem[0,1,0,1] 1901; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 1902; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0 1903; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0 1904; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1 1905; AVX-NEXT: retl 1906; 1907; AVX2-LABEL: f8xf64_f128: 1908; AVX2: # %bb.0: 1909; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] 1910; AVX2-NEXT: # ymm2 = mem[0,1,0,1] 1911; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 1912; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 1913; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0 1914; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1 1915; AVX2-NEXT: retl 1916; 1917; AVX512-LABEL: f8xf64_f128: 1918; AVX512: # %bb.0: 1919; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0] 1920; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1921; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 1922; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0 1923; AVX512-NEXT: retl 1924; 1925; AVX-64-LABEL: f8xf64_f128: 1926; AVX-64: # %bb.0: 1927; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] 1928; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] 1929; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 1930; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 1931; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 1932; AVX-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 1933; AVX-64-NEXT: retq 1934; 1935; AVX2-64-LABEL: f8xf64_f128: 1936; AVX2-64: # %bb.0: 1937; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] 1938; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] 1939; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 1940; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 1941; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 1942; AVX2-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 1943; AVX2-64-NEXT: 
retq 1944; 1945; AVX512F-64-LABEL: f8xf64_f128: 1946; AVX512F-64: # %bb.0: 1947; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0] 1948; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1949; AVX512F-64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 1950; AVX512F-64-NEXT: vdivpd %zmm0, %zmm1, %zmm0 1951; AVX512F-64-NEXT: retq 1952 %res1 = fadd <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %a 1953 %res2 = fdiv <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %res1 1954 ret <8 x double> %res2 1955} 1956 1957 1958; AVX512: .LCPI37 1959; AVX512-NEXT: .quad 0x4010000000000000 # double 4 1960; AVX512-NEXT: .quad 0x3ff0000000000000 # double 1 1961; AVX512-NEXT: .quad 0x4000000000000000 # double 2 1962; AVX512-NEXT: .quad 0x4008000000000000 # double 3 1963; AVX512-NOT: .quad 1964 1965define <8 x double> @f8xf64_f256(<8 x double> %a) { 1966; AVX-LABEL: f8xf64_f256: 1967; AVX: # %bb.0: 1968; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1969; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 1970; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0 1971; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0 1972; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1 1973; AVX-NEXT: retl 1974; 1975; AVX2-LABEL: f8xf64_f256: 1976; AVX2: # %bb.0: 1977; AVX2-NEXT: vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1978; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 1979; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 1980; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0 1981; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1 1982; AVX2-NEXT: retl 1983; 1984; AVX512-LABEL: f8xf64_f256: 1985; AVX512: # %bb.0: 1986; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1987; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 1988; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 1989; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0 1990; AVX512-NEXT: retl 
1991; 1992; AVX-64-LABEL: f8xf64_f256: 1993; AVX-64: # %bb.0: 1994; AVX-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0] 1995; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 1996; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 1997; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 1998; AVX-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 1999; AVX-64-NEXT: retq 2000; 2001; AVX2-64-LABEL: f8xf64_f256: 2002; AVX2-64: # %bb.0: 2003; AVX2-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0] 2004; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 2005; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 2006; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 2007; AVX2-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 2008; AVX2-64-NEXT: retq 2009; 2010; AVX512F-64-LABEL: f8xf64_f256: 2011; AVX512F-64: # %bb.0: 2012; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] 2013; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 2014; AVX512F-64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 2015; AVX512F-64-NEXT: vdivpd %zmm0, %zmm1, %zmm0 2016; AVX512F-64-NEXT: retq 2017 %res1 = fadd <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %a 2018 %res2 = fdiv <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %res1 2019 ret <8 x double> %res2 2020} 2021 2022 2023define <8 x i16> @f8xi16_i32_NaN(<8 x i16> %a) { 2024; AVX-LABEL: f8xi16_i32_NaN: 2025; AVX: # %bb.0: 2026; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776] 2027; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2028; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 2029; AVX-NEXT: retl 2030; 2031; ALL32-LABEL: f8xi16_i32_NaN: 2032; ALL32: # %bb.0: 2033; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776] 2034; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2035; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 2036; ALL32-NEXT: retl 2037; 2038; AVX-64-LABEL: f8xi16_i32_NaN: 2039; AVX-64: # 
%bb.0: 2040; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776] 2041; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2042; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 2043; AVX-64-NEXT: retq 2044; 2045; ALL64-LABEL: f8xi16_i32_NaN: 2046; ALL64: # %bb.0: 2047; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776] 2048; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2049; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 2050; ALL64-NEXT: retq 2051 %res1 = add <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %a 2052 %res2 = and <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %res1 2053 ret <8 x i16> %res2 2054} 2055