; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmul.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = mul <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_Qres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.16 d17, d16
; CHECK-NEXT:    vmul.i16 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = mul <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16_Qres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.16 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %tmp3
}

; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors.
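; With only two 32-bit lanes per d-register, the even-element unzip mask
; <0, 2> and the odd-element mask <1, 3> coincide with the transpose masks,
; so no separate d-register vuzp.32 test is needed. A minimal sketch of the
; shuffle pair this refers to (illustration only, not part of the generated
; checks; %a and %b are placeholder values):
;   %even = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
;   %odd  = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>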

define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vadd.i8 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <32 x i8> @vuzpQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vst1.8 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  ret <32 x i8> %tmp3
}

define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.i32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <8 x i32> @vuzpQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i32> %tmp3
}

define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

define <8 x float> @vuzpQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x float> %tmp3
}

; Undef shuffle indices should not prevent matching to VUZP:

define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmul.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  %tmp5 = mul <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef_Qres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_undef:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vuzp.16 d18, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %0
}

define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vorr d19, d18, d18
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vtrn.32 d19, d17
; CHECK-NEXT:    vdup.32 d16, d18[0]
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
  ret <4 x i32> %0
}

define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
; CHECK-LABEL: vuzp_rev_shufflemask_vtrn:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vtrn.32 d17, d16
; CHECK-NEXT:    vst1.64 {d16, d17}, [r2]
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
  store <4 x i32> %0, <4 x i32>* %C
  ret void
}

define <8 x i8> @cmpsel_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
; truncate from i32 to i16 and one vmovn.i16 to perform the final truncation to i8.
; CHECK-LABEL: cmpsel_trunc:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    mov r12, sp
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    add r12, sp, #48
; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
; CHECK-NEXT:    add r12, sp, #32
; CHECK-NEXT:    vcgt.u32 q8, q10, q8
; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
; CHECK-NEXT:    vcgt.u32 q9, q10, q9
; CHECK-NEXT:    vmov d20, r2, r3
; CHECK-NEXT:    vmovn.i32 d17, q8
; CHECK-NEXT:    vmovn.i32 d16, q9
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vmovn.i16 d16, q8
; CHECK-NEXT:    vbsl d16, d18, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %c = icmp ult <8 x i32> %cmp0, %cmp1
  %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
  ret <8 x i8> %res
}

; Shuffle the result from the compare with a <4 x i8>.
; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be able
; to perform the vuzp and get the vbsl mask.
define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r11, lr}
; CHECK-NEXT:    push {r11, lr}
; CHECK-NEXT:    add r12, sp, #8
; CHECK-NEXT:    add lr, sp, #24
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    ldr r12, [sp, #40]
; CHECK-NEXT:    vld1.64 {d18, d19}, [lr]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vld1.32 {d18[0]}, [r12:32]
; CHECK-NEXT:    vmovl.u8 q9, d18
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshr.s8 d16, d16, #7
; CHECK-NEXT:    vbsl d16, d18, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    pop {r11, lr}
; CHECK-NEXT:    mov pc, lr
                                        <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; Use an undef value for the <4 x i8> that is being shuffled with the compare result.
; This produces a build_vector with some of the operands undef.
define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r12, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshr.s8 d16, d16, #7
; CHECK-NEXT:    vbsl d16, d18, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
                                                    <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r12, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vldr d18, .LCPI22_0
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vtbl.8 d16, {d16}, d18
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshr.s8 d16, d16, #7
; CHECK-NEXT:    vbsl d16, d18, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI22_0:
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 0 @ 0x0
; CHECK-NEXT:    .byte 2 @ 0x2
; CHECK-NEXT:    .byte 4 @ 0x4
; CHECK-NEXT:    .byte 6 @ 0x6
                                                   <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; We're using large data types here, and we have to fill with undef values until we
; reach a vector size that we can represent.
define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
; CHECK-LABEL: vuzp_wide_type:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    add r12, sp, #32
; CHECK-NEXT:    add lr, sp, #48
; CHECK-NEXT:    vld1.32 {d17[0]}, [r12:32]
; CHECK-NEXT:    add r12, sp, #24
; CHECK-NEXT:    vld1.32 {d16[0]}, [r12:32]
; CHECK-NEXT:    add r12, sp, #56
; CHECK-NEXT:    vld1.32 {d19[0]}, [r12:32]
; CHECK-NEXT:    vld1.32 {d18[0]}, [lr:32]
; CHECK-NEXT:    add lr, sp, #40
; CHECK-NEXT:    vld1.32 {d20[0]}, [lr:32]
; CHECK-NEXT:    ldr r12, [sp, #68]
; CHECK-NEXT:    ldr r4, [r12]
; CHECK-NEXT:    vmov.32 d23[0], r4
; CHECK-NEXT:    add r4, sp, #64
; CHECK-NEXT:    vld1.32 {d24[0]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #36
; CHECK-NEXT:    vcgt.u32 q10, q12, q10
; CHECK-NEXT:    vld1.32 {d17[1]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #28
; CHECK-NEXT:    vld1.32 {d16[1]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #60
; CHECK-NEXT:    vld1.32 {d19[1]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #52
; CHECK-NEXT:    vld1.32 {d18[1]}, [r4:32]
; CHECK-NEXT:    add r4, r12, #4
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmovn.i32 d19, q10
; CHECK-NEXT:    vmov.u8 lr, d23[3]
; CHECK-NEXT:    vmovn.i32 d18, q8
; CHECK-NEXT:    vmovn.i16 d22, q9
; CHECK-NEXT:    vldr d18, .LCPI23_0
; CHECK-NEXT:    vmov.8 d17[0], lr
; CHECK-NEXT:    vtbl.8 d16, {d22, d23}, d18
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    vld1.8 {d17[1]}, [r4]
; CHECK-NEXT:    add r4, sp, #8
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vshl.i8 q8, q8, #7
; CHECK-NEXT:    vld1.64 {d20, d21}, [r4]
; CHECK-NEXT:    vshr.s8 q8, q8, #7
; CHECK-NEXT:    vbsl q8, q9, q10
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    pop {r4, lr}
; CHECK-NEXT:    mov pc, lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI23_0:
; CHECK-NEXT:    .byte 0 @ 0x0
; CHECK-NEXT:    .byte 1 @ 0x1
; CHECK-NEXT:    .byte 2 @ 0x2
; CHECK-NEXT:    .byte 3 @ 0x3
; CHECK-NEXT:    .byte 4 @ 0x4
; CHECK-NEXT:    .byte 8 @ 0x8
; CHECK-NEXT:    .byte 9 @ 0x9
; CHECK-NEXT:    .byte 10 @ 0xa
                                 <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) {
  %cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
  %c0 = icmp ult <5 x i32> %cmp0, %cmp1
  %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
  ret <10 x i8> %rv
}

%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
; CHECK-LABEL: vuzp_extract_subvector:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r2, r3
; CHECK-NEXT:    vmov d17, r0, r1
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmov r0, r1, d17
; CHECK-NEXT:    vmov r2, r3, d16
; CHECK-NEXT:    mov pc, lr

  %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
  ret %struct.uint8x8x2_t %.fca.0.1.insert
}
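
; A note on the checks for vuzp_extract_subvector above: %struct.uint8x8x2_t
; is an aggregate of two <8 x i8> halves (mirroring the NEON uint8x8x2_t
; type), and returning it by value at the IR level lets the backend split it
; across r0-r3. The <16 x i8> argument likewise arrives in r0-r3, so no loads
; are needed: vuzp.8 deinterleaves d17 (low half) against d16 (high half) in
; place, and the two vmov pairs move the even and odd lanes back out as the
; two struct fields.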