; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding | FileCheck %s

; 256-bit

define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8)

define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8)

define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d_256
; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d_256
; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8)

define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q_256
; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q_256
; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)

define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpled %ymm1, %ymm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i8, i8) nounwind readnone

define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_256
; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_d_256
; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i8, i8) nounwind readnone

define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i8, i8) nounwind readnone

define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_256
; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_256
; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i8, i8) nounwind readnone

; 128-bit

define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8)

define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)

define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d_128
; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d_128
; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8)

define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q_128
; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q_128
; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)

define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpled %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i8, i8) nounwind readnone

define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_128
; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_d_128
; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleud %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i8, i8) nounwind readnone

define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i8, i8) nounwind readnone

define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_128
; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_128
; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i8, i8) nounwind readnone
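; Compress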
; CHECK-LABEL: compr1
; CHECK: vcompresspd %zmm0
define void @compr1(i8* %addr, <8 x double> %data, i8 %mask) {
  call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
  ret void
}

declare void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)

; CHECK-LABEL: compr2
; CHECK: vcompresspd %ymm0
define void @compr2(i8* %addr, <4 x double> %data, i8 %mask) {
  call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
  ret void
}

declare void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)

; CHECK-LABEL: compr3
; CHECK: vcompressps %xmm0
define void @compr3(i8* %addr, <4 x float> %data, i8 %mask) {
  call void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
  ret void
}

declare void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)

; CHECK-LABEL: compr4
; CHECK: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0]
define <8 x double> @compr4(i8* %addr, <8 x double> %data, i8 %mask) {
  %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
  ret <8 x double> %res
}

declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)

; CHECK-LABEL: compr5
; CHECK: vcompresspd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1]
define <4 x double> @compr5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
  %res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
  ret <4 x double> %res
}

declare <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)

; CHECK-LABEL: compr6
; CHECK: vcompressps %xmm0
define <4 x float> @compr6(<4 x float> %data, i8 %mask) {
  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)

; CHECK-LABEL: compr7
; CHECK-NOT: vcompress
; CHECK: vmovapd
define void @compr7(i8* %addr, <8 x double> %data) {
  call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1)
  ret void
}

; CHECK-LABEL: compr8
; CHECK-NOT: vcompressps %xmm0
define <4 x float> @compr8(<4 x float> %data) {
  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}

; CHECK-LABEL: compr9
; CHECK: vpcompressq %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07]
define void @compr9(i8* %addr, <8 x i64> %data, i8 %mask) {
  call void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
  ret void
}

declare void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)

; CHECK-LABEL: compr10
; CHECK: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) {
  %res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)

; Expand

; CHECK-LABEL: expand1
; CHECK: vexpandpd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07]
define <8 x double> @expand1(i8* %addr, <8 x double> %data, i8 %mask) {
  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
  ret <8 x double> %res
}

declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)

; CHECK-LABEL: expand2
; CHECK: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
define <4 x double> @expand2(i8* %addr, <4 x double> %data, i8 %mask) {
  %res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
  ret <4 x double> %res
}

declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)

; CHECK-LABEL: expand3
; CHECK: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07]
define <4 x float> @expand3(i8* %addr, <4 x float> %data, i8 %mask) {
  %res = call <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)

; CHECK-LABEL: expand4
; CHECK: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0]
define <8 x double> @expand4(i8* %addr, <8 x double> %data, i8 %mask) {
  %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
  ret <8 x double> %res
}

declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)

; CHECK-LABEL: expand5
; CHECK: vexpandpd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8]
define <4 x double> @expand5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
  %res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
  ret <4 x double> %res
}

declare <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)

; CHECK-LABEL: expand6
; CHECK: vexpandps %xmm0
define <4 x float> @expand6(<4 x float> %data, i8 %mask) {
  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)

; CHECK-LABEL: expand7
; CHECK-NOT: vexpand
; CHECK: vmovapd
define <8 x double> @expand7(i8* %addr, <8 x double> %data) {
  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1)
  ret <8 x double> %res
}

; CHECK-LABEL: expand8
; CHECK-NOT: vexpandps %xmm0
define <4 x float> @expand8(<4 x float> %data) {
  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}

; CHECK-LABEL: expand9
; CHECK: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07]
define <8 x i64> @expand9(i8* %addr, <8 x i64> %data, i8 %mask) {
  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)

; CHECK-LABEL: expand10
; CHECK: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0]
define <4 x i32> @expand10(<4 x i32> %data, i8 %mask) {
  %res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)

define <8 x float> @test_x86_mask_blend_ps_256(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
  ; CHECK: vblendmps %ymm1, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float> %a1, <8 x float> %a2, i8 %a0) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}

declare <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readonly

define <4 x double> @test_x86_mask_blend_pd_256(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
  ; CHECK: vblendmpd %ymm1, %ymm0
  %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a1, <4 x double> %a2, i8 %a0) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}

define <4 x double> @test_x86_mask_blend_pd_256_memop(<4 x double> %a, <4 x double>* %ptr, i8 %mask) {
  ; CHECK-LABEL: test_x86_mask_blend_pd_256_memop
  ; CHECK: vblendmpd (%
  %b = load <4 x double>, <4 x double>* %ptr
  %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a, <4 x double> %b, i8 %mask) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readonly

; CHECK-LABEL: test_x86_mask_blend_d_256
; CHECK: vpblendmd
define <8 x i32> @test_x86_mask_blend_d_256(i8 %a0, <8 x i32> %a1, <8 x i32> %a2) {
  %res = call <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32> %a1, <8 x i32> %a2, i8 %a0) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32>, <8 x i32>, i8) nounwind readonly

define <4 x i64> @test_x86_mask_blend_q_256(i8 %a0, <4 x i64> %a1, <4 x i64> %a2) {
  ; CHECK: vpblendmq
  %res = call <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64> %a1, <4 x i64> %a2, i8 %a0) ; <<4 x i64>> [#uses=1]
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x float> @test_x86_mask_blend_ps_128(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
  ; CHECK: vblendmps %xmm1, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float> %a1, <4 x float> %a2, i8 %a0) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly

define <2 x double> @test_x86_mask_blend_pd_128(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
  ; CHECK: vblendmpd %xmm1, %xmm0
  %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a1, <2 x double> %a2, i8 %a0) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}

define <2 x double> @test_x86_mask_blend_pd_128_memop(<2 x double> %a, <2 x double>* %ptr, i8 %mask) {
  ; CHECK-LABEL: test_x86_mask_blend_pd_128_memop
  ; CHECK: vblendmpd (%
  %b = load <2 x double>, <2 x double>* %ptr
  %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a, <2 x double> %b, i8 %mask) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double>, <2 x double>, i8) nounwind readonly

define <4 x i32> @test_x86_mask_blend_d_128(i8 %a0, <4 x i32> %a1, <4 x i32> %a2) {
  ; CHECK: vpblendmd
  %res = call <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32> %a1, <4 x i32> %a2, i8 %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_x86_mask_blend_q_128(i8 %a0, <2 x i64> %a1, <2 x i64> %a2) {
  ; CHECK: vpblendmq
  %res = call <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64> %a1, <2 x i64> %a2, i8 %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64>, <2 x i64>, i8) nounwind readonly