; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

; Each function below builds an interleaving shufflevector of its two loaded
; operands and pins the NEON code expected for it (a vzip of matching element
; size). The "_Qres"/"_QQres" variants produce a result twice as wide as the
; inputs from a single full-width interleave mask.

; 64-bit operands, i8 elements: low-half and high-half interleaves are added
; together, so both outputs of one vzip.8 are consumed.
define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vzipi8:
; CHECK: @ BB#0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vzip.8 d17, d16
; CHECK-NEXT: vadd.i8 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

; Same operands, but one 16-element interleave mask yields a <16 x i8> result;
; both vzip.8 output registers are returned.
define <16 x i8> @vzipi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vzipi8_Qres:
; CHECK: @ BB#0:
; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
; CHECK-NEXT: vzip.8 [[LDR0]], [[LDR1]]
; CHECK-NEXT: vmov r0, r1, [[LDR0]]
; CHECK-NEXT: vmov r2, r3, [[LDR1]]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x i8> %tmp3
}

; 64-bit operands, i16 elements: low/high interleaves added together.
define <4 x i16> @vzipi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vzipi16:
; CHECK: @ BB#0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vzip.16 d17, d16
; CHECK-NEXT: vadd.i16 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

; Full-width i16 interleave producing an <8 x i16> result.
define <8 x i16> @vzipi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vzipi16_Qres:
; CHECK: @ BB#0:
; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
; CHECK-NEXT: vzip.16 [[LDR0]], [[LDR1]]
; CHECK-NEXT: vmov r0, r1, [[LDR0]]
; CHECK-NEXT: vmov r2, r3, [[LDR1]]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %tmp3
}

; VZIP.32 is equivalent to VTRN.32 for 64-bit vectors.

; 128-bit operands, i8 elements: low/high interleaves added together; the
; expected vzip.8 operates on Q registers.
define <16 x i8> @vzipQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vzipQi8:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vzip.8 q9, q8
; CHECK-NEXT: vadd.i8 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

; <32 x i8> result: the expected code loads the operands via r1/r2 and stores
; both vzip.8 outputs through r0.
define <32 x i8> @vzipQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vzipQi8_QQres:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vzip.8 q9, q8
; CHECK-NEXT: vst1.8 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <32 x i8> %tmp3
}

; 128-bit operands, i16 elements: low/high interleaves added together.
define <8 x i16> @vzipQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vzipQi16:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vzip.16 q9, q8
; CHECK-NEXT: vadd.i16 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

; <16 x i16> result stored through r0 after a single vzip.16.
define <16 x i16> @vzipQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vzipQi16_QQres:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vzip.16 q9, q8
; CHECK-NEXT: vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x i16> %tmp3
}

; 128-bit operands, i32 elements: low/high interleaves added together.
define <4 x i32> @vzipQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vzipQi32:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vzip.32 q9, q8
; CHECK-NEXT: vadd.i32 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; <8 x i32> result stored through r0 after a single vzip.32.
define <8 x i32> @vzipQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vzipQi32_QQres:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vzip.32 q9, q8
; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i32> %tmp3
}

; Float variant of vzipQi32: same masks, combined with fadd.
define <4 x float> @vzipQf(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vzipQf:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vzip.32 q9, q8
; CHECK-NEXT: vadd.f32 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

; Float variant of vzipQi32_QQres.
define <8 x float> @vzipQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vzipQf_QQres:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vzip.32 q9, q8
; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x float> %tmp3
}

; Undef shuffle indices should not prevent matching to VZIP:

define <8 x i8> @vzipi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vzipi8_undef:
; CHECK: @ BB#0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vzip.8 d17, d16
; CHECK-NEXT: vadd.i8 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 undef, i32 1, i32 9, i32 undef, i32 10, i32 3, i32 11>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 undef, i32 undef, i32 15>
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vzipi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vzipi8_undef_Qres:
; CHECK: @ BB#0:
; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
; CHECK-NEXT: vzip.8 [[LDR0]], [[LDR1]]
; CHECK-NEXT: vmov r0, r1, [[LDR0]]
; CHECK-NEXT: vmov r2, r3, [[LDR1]]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 undef, i32 1, i32 9, i32 undef, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 undef, i32 undef, i32 15>
  ret <16 x i8> %tmp3
}

define <16 x i8> @vzipQi8_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vzipQi8_undef:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vzip.8 q9, q8
; CHECK-NEXT: vadd.i8 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 undef, i32 undef, i32 undef, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 undef, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 undef, i32 14, i32 30, i32 undef, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <32 x i8> @vzipQi8_undef_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vzipQi8_undef_QQres:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vzip.8 q9, q8
; CHECK-NEXT: vst1.8 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 16, i32 1, i32 undef, i32 undef, i32 undef, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 undef, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 undef, i32 14, i32 30, i32 undef, i32 31>
  ret <32 x i8> %tmp3
}

; The first four mask elements are undef and the last four form the high-half
; interleave; a vzip is still expected.
define <8 x i16> @vzip_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
entry:
  ; CHECK-LABEL: vzip_lower_shufflemask_undef
  ; CHECK: vzip
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %0
}

; Both shuffle operands are the same value and the mask is <0, 0, 1, 0>;
; no vtrn may appear before the expected vzip.
define <4 x i32> @vzip_lower_shufflemask_zeroed(<2 x i32>* %A) {
entry:
  ; CHECK-LABEL: vzip_lower_shufflemask_zeroed
  ; CHECK-NOT: vtrn
  ; CHECK: vzip
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp1, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
  ret <4 x i32> %0
}

; Same-operand shuffle with mask <0, 2, 1, 0>; no vuzp may appear before the
; expected vzip.
define <4 x i32> @vzip_lower_shufflemask_vuzp(<2 x i32>* %A) {
entry:
  ; CHECK-LABEL: vzip_lower_shufflemask_vuzp
  ; CHECK-NOT: vuzp
  ; CHECK: vzip
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp1, <4 x i32> <i32 0, i32 2, i32 1, i32 0>
  ret <4 x i32> %0
}

; Second shuffle operand is undef and the mask is <1, 1, 0, 0>; no vtrn may
; appear before the expected vzip.
define void @vzip_undef_rev_shufflemask_vtrn(<2 x i32>* %A, <4 x i32>* %B) {
entry:
  ; CHECK-LABEL: vzip_undef_rev_shufflemask_vtrn
  ; CHECK-NOT: vtrn
  ; CHECK: vzip
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
  store <4 x i32> %0, <4 x i32>* %B
  ret void
}

; Narrowing shuffle of an <8 x i16> with mask <4, 4, 5, 3>; a vext.16 with
; immediate #3 is expected, followed by a vzip.
define void @vzip_vext_factor(<8 x i16>* %A, <4 x i16>* %B) {
entry:
  ; CHECK-LABEL: vzip_vext_factor
  ; CHECK: vext.16 d16, d16, d17, #3
  ; CHECK: vzip
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %0 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 4, i32 5, i32 3>
  store <4 x i16> %0, <4 x i16>* %B
  ret void
}