; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c

define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsb %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <32 x i8>
  %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %arg, i1 false)
  %res = bitcast <32 x i8> %abs to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) nounwind readnone

define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsw %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <16 x i16>
  %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false)
  %res = bitcast <16 x i16> %abs to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) nounwind readnone

define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <8 x i32>
  %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %arg, i1 false)
  %res = bitcast <8 x i32> %abs to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) nounwind readnone

define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = add <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = add <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = add <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = add <4 x i64> %a0, %a1
  ret <4 x i64> %res
}
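
; The adds_epi*/adds_epu* tests below cover the saturating-add intrinsics:
; signed saturation lowers to vpadds* and unsigned saturation to vpaddus*,
; clamping to the element range instead of wrapping like the plain adds above.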

define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test2_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
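
; Note that fast-isel picks the FP-domain vandps for the plain 256-bit and,
; and does not fold the xor-with-all-ones into vpandn: the all-ones vector is
; materialized explicitly with vpcmpeqd in the andnot test below.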

define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = and <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %not, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}
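
; Unlike the immediate blends above, vpblendvb selects each byte based on the
; sign bit of the corresponding byte of the mask operand (%a2 below).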

define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_mm256_blendv_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone

define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i8> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}
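
; 64-bit element splats use vmovddup in the 128-bit forms and vbroadcastsd
; from the low xmm register in the 256-bit forms.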

define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %a0 = load <2 x i64>, <2 x i64>* %p0
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
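
; Like bslli above, the bsrli byte shift is modeled as a shufflevector with a
; zero vector and shifts bytes within each 128-bit lane independently.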

define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp eq <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp eq <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp eq <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}
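
; The signed greater-than compares follow the same icmp+sext pattern as the
; equality compares above, selecting the corresponding vpcmpgt* instruction.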

define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp sgt <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = sext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = sext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = sext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = sext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}
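
; Unsigned widening conversions (cvtepu*) lower to vpmovzx*; the shuffle
; comments in the checks spell out the interleaved zero bytes.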

define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = zext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = zext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = zext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = zext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extracti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}
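
; Horizontal add/sub have no generic IR equivalent, so these tests call the
; target-specific llvm.x86.avx2.phadd*/phsub* intrinsics directly.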

define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
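
; Gather tests. An element is loaded only where the sign bit of the
; corresponding mask element is set; an all-ones mask (vpcmpeqd of a register
; with itself) gathers every element, and the pass-through operand (zeroed
; here) supplies any masked-off lanes. All of these tests use a scale of 2.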

define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}

define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
  ret <4 x i64> %res
}
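
; The floating-point gathers build their all-true mask as an FP compare that
; always holds: fcmp oeq on zero vectors for the 128-bit forms, or the
; llvm.x86.avx.cmp.* intrinsic with predicate 0 (vcmpeqpd/vcmpeqps) for the
; 256-bit forms.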

define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
; X86-LABEL: test_mm_mask_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
  ret <2 x double> %res
}

define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
  ret <4 x double> %res
}

define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovaps %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovaps %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
  ret <8 x float> %call
}
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly

define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
  ret <8 x float> %call
}
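
; i64-indexed gathers of 32-bit elements (vpgatherqd/vgatherqps) only produce
; an xmm result even with ymm indices, which is why the 256-bit variants end
; with vzeroupper.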

define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %call
}
declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
  ret <2 x i64> %call
}

define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
  ret <4 x i64> %call
}
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
  ret <4 x i64> %call
}

define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
  ret <2 x double> %call
}
declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
; X86-LABEL: test_mm_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
  ret <2 x double> %call
}

define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
  ret <4 x double> %call
}
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, i64 *%a1, <4 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
  ret <4 x double> %call
}

define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}
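
; inserti128 is written as a pair of shufflevectors; inserting into the low
; half lowers to a 128-bit blend, while inserting into the high half uses
; vinsertf128.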
define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test0_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test1_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_madd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_maddubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
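; The maskload/maskstore tests pass the pointer as a plain i8*; the element
; width is implied by the intrinsic suffix (.d for 32-bit, .q for 64-bit),
; and only lanes whose mask element has its sign bit set are accessed.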
define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly

define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly

define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone

define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone

define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)

define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)

define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)

define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)

define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovmskb %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
  ret i32 %res
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mpsadbw_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
  %bc = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
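; _mm256_mul_epi32 and _mm256_mul_epu32 are expressed in generic IR: a
; shl/ashr pair sign-extends the low 32 bits of each i64 element and an AND
; with 0xffffffff zero-extends them, which the backend matches to vpmuldq
; and vpmuludq respectively.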
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
  %res = mul nsw <4 x i64> %A1, %B1
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %res = mul nuw <4 x i64> %A, %B
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhrs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = mul <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = mul <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = or <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
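; The constant permutes are plain shufflevectors; the backend is free to pick
; the floating-point forms (vperm2f128, vpermpd) of the cross-lane shuffles.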
define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permute2x128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sad_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shuffle_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_shuffle_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflehi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflelo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
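; The shift tests cover all three AVX2 variants: psll/psrl/psra take the
; count from the low 64 bits of an xmm operand, the pslli/psrli/psrai forms
; take an immediate, and the psllv/psrlv/psrav forms shift each element by a
; per-element count. The byte shifts (slli_si256/srli_si256) are expressed
; as shufflevectors against a zero vector that stay within each 128-bit lane.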
define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone

define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srav_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srav_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
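; _mm256_stream_load_si256 maps to llvm.x86.avx2.movntdqa, the non-temporal
; (streaming) load, which selects vmovntdqa.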
define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
; X86-LABEL: test_mm256_stream_load_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntdqa (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_load_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovntdqa (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> *%a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly

define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = sub <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = sub <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = sub <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = sub <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)
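; The unpack tests interleave elements with shufflevector masks that operate
; within each 128-bit lane, matching the punpck* semantics; the 32- and
; 64-bit cases may be lowered to the equivalent floating-point unpacks.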
define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = xor <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone