; Tests sign/zero-extension of fixed-length vectors when lowered via SVE.
; Each RUN line must be on its own line for lit to pick it up.
; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: z{0-9}

;
; sext i1 -> i32
;

; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
; type's element type is not byte based and thus cannot be lowered directly to
; an SVE instruction.
define void @sext_v8i1_v8i32(<8 x i1> %a, <8 x i32>* %out) #0 {
; CHECK-LABEL: sext_v8i1_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; CHECK-NEXT: lsl [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, #31
; CHECK-NEXT: asr [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, #31
; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = sext <8 x i1> %a to <8 x i32>
  store <8 x i32> %b, <8 x i32>* %out
  ret void
}

;
; sext i3 -> i64
;

; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
; type's element type is not power-of-2 based and thus cannot be lowered
; directly to an SVE instruction.
define void @sext_v4i3_v4i64(<4 x i3> %a, <4 x i64>* %out) #0 {
; CHECK-LABEL: sext_v4i3_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; CHECK-NEXT: lsl [[A_DWORDS]].d, [[PG]]/m, [[A_DWORDS]].d, #61
; CHECK-NEXT: asr [[A_DWORDS]].d, [[PG]]/m, [[A_DWORDS]].d, #61
; NOTE: The stored register is the doubleword-typed result of the shift pair,
; i.e. [[A_DWORDS]], not the intermediate [[A_WORDS]].
; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = sext <4 x i3> %a to <4 x i64>
  store <4 x i64> %b, <4 x i64>* %out
  ret void
}

;
; sext i8 -> i16
;

define void @sext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
; CHECK-LABEL: sext_v16i8_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
; CHECK-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = sext <16 x i8> %a to <16 x i16>
  store <16 x i16> %b, <16 x i16>* %out
  ret void
}

; NOTE: Extra 'add' is to prevent the extend being combined with the load.
define void @sext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
; CHECK-LABEL: sext_v32i8_v32i16:
; VBITS_GE_512: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
; VBITS_GE_512-NEXT: ret
  %a = load <32 x i8>, <32 x i8>* %in
  %b = add <32 x i8> %a, %a
  %c = sext <32 x i8> %b to <32 x i16>
  store <32 x i16> %c, <32 x i16>* %out
  ret void
}

define void @sext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
; CHECK-LABEL: sext_v64i8_v64i16:
; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
  %a = load <64 x i8>, <64 x i8>* %in
  %b = add <64 x i8> %a, %a
  %c = sext <64 x i8> %b to <64 x i16>
  store <64 x i16> %c, <64 x i16>* %out
  ret void
}

define void @sext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
; CHECK-LABEL: sext_v128i8_v128i16:
; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
  %a = load <128 x i8>, <128 x i8>* %in
  %b = add <128 x i8> %a, %a
  %c = sext <128 x i8> %b to <128 x i16>
  store <128 x i16> %c, <128 x i16>* %out
  ret void
}

;
; sext i8 -> i32
;

define void @sext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) #0 {
; CHECK-LABEL: sext_v8i8_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; NOTE: The store must use the fully-extended word result, not the halfword
; intermediate.
; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = sext <8 x i8> %a to <8 x i32>
  store <8 x i32> %b, <8 x i32>* %out
  ret void
}

define void @sext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 {
; CHECK-LABEL: sext_v16i8_v16i32:
; NOTE: The input arrives in z0; there is no load/add here, so no [[A_BYTES]]
; variable exists in this function.
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256: ext v[[A_HI:[0-9]+]].16b, v0.16b, v0.16b, #8
; VBITS_EQ_256-DAG: sunpklo [[A_HALFS_LO:z[0-9]+]].h, z0.b
; VBITS_EQ_256-DAG: sunpklo [[A_HALFS_HI:z[0-9]+]].h, z[[A_HI]].b
; VBITS_EQ_256-DAG: sunpklo [[A_WORDS_LO:z[0-9]+]].s, [[A_HALFS_LO]].h
; VBITS_EQ_256-DAG: sunpklo [[A_WORDS_HI:z[0-9]+]].s, [[A_HALFS_HI]].h
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: add x[[OUT_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: st1w { [[A_WORDS_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[A_WORDS_HI]].s }, [[PG]], [x[[OUT_HI]]]
; VBITS_EQ_256-NEXT: ret
  %b = sext <16 x i8> %a to <16 x i32>
  store <16 x i32> %b, <16 x i32>* %out
  ret void
}

define void @sext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
; CHECK-LABEL: sext_v32i8_v32i32:
; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
  %a = load <32 x i8>, <32 x i8>* %in
  %b = add <32 x i8> %a, %a
  %c = sext <32 x i8> %b to <32 x i32>
  store <32 x i32> %c, <32 x i32>* %out
  ret void
}

define void @sext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
; CHECK-LABEL: sext_v64i8_v64i32:
; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
  %a = load <64 x i8>, <64 x i8>* %in
  %b = add <64 x i8> %a, %a
  %c = sext <64 x i8> %b to <64 x i32>
  store <64 x i32> %c, <64 x i32>* %out
  ret void
}

;
; sext i8 -> i64
;

; NOTE: v4i8 is an unpacked typed stored within a v4i16 container. The sign
; extend is a two step process where the container is any_extend'd with the
; result feeding an inreg sign extend.
define void @sext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) #0 {
; CHECK-LABEL: sext_v4i8_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: uunpklo [[ANYEXT_W:z[0-9]+]].s, z0.h
; CHECK-NEXT: uunpklo [[ANYEXT_D:z[0-9]+]].d, [[ANYEXT_W]].s
; CHECK-NEXT: sxtb [[A_DWORDS:z[0-9]+]].d, [[PG]]/m, [[ANYEXT_D]].d
; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = sext <4 x i8> %a to <4 x i64>
  store <4 x i64> %b, <4 x i64>* %out
  ret void
}

define void @sext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 {
; CHECK-LABEL: sext_v8i8_v8i64:
; NOTE: Input arrives in z0 (no load/add), so the first unpack reads z0.b.
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %b = sext <8 x i8> %a to <8 x i64>
  store <8 x i64> %b, <8 x i64>* %out
  ret void
}

define void @sext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 {
; CHECK-LABEL: sext_v16i8_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b
; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %b = sext <16 x i8> %a to <16 x i64>
  store <16 x i64> %b, <16 x i64>* %out
  ret void
}

define void @sext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
; CHECK-LABEL: sext_v32i8_v32i64:
; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
  %a = load <32 x i8>, <32 x i8>* %in
  %b = add <32 x i8> %a, %a
  %c = sext <32 x i8> %b to <32 x i64>
  store <32 x i64> %c, <32 x i64>* %out
  ret void
}

;
; sext i16 -> i32
;

define void @sext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) #0 {
; CHECK-LABEL: sext_v8i16_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = sext <8 x i16> %a to <8 x i32>
  store <8 x i32> %b, <8 x i32>* %out
  ret void
}

define void @sext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
; CHECK-LABEL: sext_v16i16_v16i32:
; VBITS_GE_512: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
; VBITS_GE_512-NEXT: ret
  %a = load <16 x i16>, <16 x i16>* %in
  %b = add <16 x i16> %a, %a
  %c = sext <16 x i16> %b to <16 x i32>
  store <16 x i32> %c, <16 x i32>* %out
  ret void
}

define void @sext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
; CHECK-LABEL: sext_v32i16_v32i32:
; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
  %a = load <32 x i16>, <32 x i16>* %in
  %b = add <32 x i16> %a, %a
  %c = sext <32 x i16> %b to <32 x i32>
  store <32 x i32> %c, <32 x i32>* %out
  ret void
}

define void @sext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
; CHECK-LABEL: sext_v64i16_v64i32:
; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
  %a = load <64 x i16>, <64 x i16>* %in
  %b = add <64 x i16> %a, %a
  %c = sext <64 x i16> %b to <64 x i32>
  store <64 x i32> %c, <64 x i32>* %out
  ret void
}

;
; sext i16 -> i64
;

define void @sext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) #0 {
; CHECK-LABEL: sext_v4i16_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
; CHECK-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = sext <4 x i16> %a to <4 x i64>
  store <4 x i64> %b, <4 x i64>* %out
  ret void
}

define void @sext_v8i16_v8i64(<8 x i16> %a, <8 x i64>* %out) #0 {
; CHECK-LABEL: sext_v8i16_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, z0.h
; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %b = sext <8 x i16> %a to <8 x i64>
  store <8 x i64> %b, <8 x i64>* %out
  ret void
}

define void @sext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
; CHECK-LABEL: sext_v16i16_v16i64:
; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
  %a = load <16 x i16>, <16 x i16>* %in
  %b = add <16 x i16> %a, %a
  %c = sext <16 x i16> %b to <16 x i64>
  store <16 x i64> %c, <16 x i64>* %out
  ret void
}

define void @sext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
; CHECK-LABEL: sext_v32i16_v32i64:
; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
  %a = load <32 x i16>, <32 x i16>* %in
  %b = add <32 x i16> %a, %a
  %c = sext <32 x i16> %b to <32 x i64>
  store <32 x i64> %c, <32 x i64>* %out
  ret void
}

;
; sext i32 -> i64
;

define void @sext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) #0 {
; CHECK-LABEL: sext_v4i32_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, z0.s
; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = sext <4 x i32> %a to <4 x i64>
  store <4 x i64> %b, <4 x i64>* %out
  ret void
}

define void @sext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
; CHECK-LABEL: sext_v8i32_v8i64:
; VBITS_GE_512: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i32>, <8 x i32>* %in
  %b = add <8 x i32> %a, %a
  %c = sext <8 x i32> %b to <8 x i64>
  store <8 x i64> %c, <8 x i64>* %out
  ret void
}

define void @sext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
; CHECK-LABEL: sext_v16i32_v16i64:
; VBITS_GE_1024: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
  %a = load <16 x i32>, <16 x i32>* %in
  %b = add <16 x i32> %a, %a
  %c = sext <16 x i32> %b to <16 x i64>
  store <16 x i64> %c, <16 x i64>* %out
  ret void
}

define void @sext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 {
; CHECK-LABEL: sext_v32i32_v32i64:
; VBITS_GE_2048: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
  %a = load <32 x i32>, <32 x i32>* %in
  %b = add <32 x i32> %a, %a
  %c = sext <32 x i32> %b to <32 x i64>
  store <32 x i64> %c, <32 x i64>* %out
  ret void
}
;
; zext i8 -> i16
;

define void @zext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 {
; CHECK-LABEL: zext_v16i8_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
; CHECK-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = zext <16 x i8> %a to <16 x i16>
  store <16 x i16> %b, <16 x i16>* %out
  ret void
}

; NOTE: Extra 'add' is to prevent the extend being combined with the load.
define void @zext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 {
; CHECK-LABEL: zext_v32i8_v32i16:
; VBITS_GE_512: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
; VBITS_GE_512-NEXT: ret
  %a = load <32 x i8>, <32 x i8>* %in
  %b = add <32 x i8> %a, %a
  %c = zext <32 x i8> %b to <32 x i16>
  store <32 x i16> %c, <32 x i16>* %out
  ret void
}

define void @zext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 {
; CHECK-LABEL: zext_v64i8_v64i16:
; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
  %a = load <64 x i8>, <64 x i8>* %in
  %b = add <64 x i8> %a, %a
  %c = zext <64 x i8> %b to <64 x i16>
  store <64 x i16> %c, <64 x i16>* %out
  ret void
}

define void @zext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 {
; CHECK-LABEL: zext_v128i8_v128i16:
; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
  %a = load <128 x i8>, <128 x i8>* %in
  %b = add <128 x i8> %a, %a
  %c = zext <128 x i8> %b to <128 x i16>
  store <128 x i16> %c, <128 x i16>* %out
  ret void
}

;
; zext i8 -> i32
;

define void @zext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) #0 {
; CHECK-LABEL: zext_v8i8_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; NOTE: The store must use the fully-extended word result, not the halfword
; intermediate.
; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = zext <8 x i8> %a to <8 x i32>
  store <8 x i32> %b, <8 x i32>* %out
  ret void
}

define void @zext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 {
; CHECK-LABEL: zext_v16i8_v16i32:
; NOTE: The input arrives in z0; there is no load/add here, so no [[A_BYTES]]
; variable exists in this function.
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256: ext v[[A_HI:[0-9]+]].16b, v0.16b, v0.16b, #8
; VBITS_EQ_256-DAG: uunpklo [[A_HALFS_LO:z[0-9]+]].h, z0.b
; VBITS_EQ_256-DAG: uunpklo [[A_HALFS_HI:z[0-9]+]].h, z[[A_HI]].b
; VBITS_EQ_256-DAG: uunpklo [[A_WORDS_LO:z[0-9]+]].s, [[A_HALFS_LO]].h
; VBITS_EQ_256-DAG: uunpklo [[A_WORDS_HI:z[0-9]+]].s, [[A_HALFS_HI]].h
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: add x[[OUT_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: st1w { [[A_WORDS_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[A_WORDS_HI]].s }, [[PG]], [x[[OUT_HI]]]
; VBITS_EQ_256-NEXT: ret
  %b = zext <16 x i8> %a to <16 x i32>
  store <16 x i32> %b, <16 x i32>* %out
  ret void
}

define void @zext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 {
; CHECK-LABEL: zext_v32i8_v32i32:
; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
  %a = load <32 x i8>, <32 x i8>* %in
  %b = add <32 x i8> %a, %a
  %c = zext <32 x i8> %b to <32 x i32>
  store <32 x i32> %c, <32 x i32>* %out
  ret void
}

define void @zext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 {
; CHECK-LABEL: zext_v64i8_v64i32:
; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
  %a = load <64 x i8>, <64 x i8>* %in
  %b = add <64 x i8> %a, %a
  %c = zext <64 x i8> %b to <64 x i32>
  store <64 x i32> %c, <64 x i32>* %out
  ret void
}

;
; zext i8 -> i64
;

; NOTE: v4i8 is an unpacked typed stored within a v4i16 container. The zero
; extend is a two step process where the container is zero_extend_inreg'd with
; the result feeding a normal zero extend from halfs to doublewords.
define void @zext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) #0 {
; CHECK-LABEL: zext_v4i8_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: bic v0.4h, #255, lsl #8
; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = zext <4 x i8> %a to <4 x i64>
  store <4 x i64> %b, <4 x i64>* %out
  ret void
}

define void @zext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 {
; CHECK-LABEL: zext_v8i8_v8i64:
; NOTE: Input arrives in z0 (no load/add), so the first unpack reads z0.b.
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %b = zext <8 x i8> %a to <8 x i64>
  store <8 x i64> %b, <8 x i64>* %out
  ret void
}

define void @zext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 {
; CHECK-LABEL: zext_v16i8_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b
; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %b = zext <16 x i8> %a to <16 x i64>
  store <16 x i64> %b, <16 x i64>* %out
  ret void
}

define void @zext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 {
; CHECK-LABEL: zext_v32i8_v32i64:
; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b
; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b
; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
  %a = load <32 x i8>, <32 x i8>* %in
  %b = add <32 x i8> %a, %a
  %c = zext <32 x i8> %b to <32 x i64>
  store <32 x i64> %c, <32 x i64>* %out
  ret void
}

;
; zext i16 -> i32
;

define void @zext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) #0 {
; CHECK-LABEL: zext_v8i16_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
; CHECK-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = zext <8 x i16> %a to <8 x i32>
  store <8 x i32> %b, <8 x i32>* %out
  ret void
}

define void @zext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 {
; CHECK-LABEL: zext_v16i16_v16i32:
; VBITS_GE_512: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
; VBITS_GE_512-NEXT: ret
  %a = load <16 x i16>, <16 x i16>* %in
  %b = add <16 x i16> %a, %a
  %c = zext <16 x i16> %b to <16 x i32>
  store <16 x i32> %c, <16 x i32>* %out
  ret void
}

define void @zext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 {
; CHECK-LABEL: zext_v32i16_v32i32:
; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
  %a = load <32 x i16>, <32 x i16>* %in
  %b = add <32 x i16> %a, %a
  %c = zext <32 x i16> %b to <32 x i32>
  store <32 x i32> %c, <32 x i32>* %out
  ret void
}

define void @zext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 {
; CHECK-LABEL: zext_v64i16_v64i32:
; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
  %a = load <64 x i16>, <64 x i16>* %in
  %b = add <64 x i16> %a, %a
  %c = zext <64 x i16> %b to <64 x i32>
  store <64 x i32> %c, <64 x i32>* %out
  ret void
}

;
; zext i16 -> i64
;

define void @zext_v4i16_v4i64(<4 x i16> %a, <4 x i64>* %out) #0 {
; CHECK-LABEL: zext_v4i16_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = zext <4 x i16> %a to <4 x i64>
  store <4 x i64> %b, <4 x i64>* %out
  ret void
}

define void @zext_v8i16_v8i64(<8 x i16> %a, <8 x i64>* %out) #0 {
; CHECK-LABEL: zext_v8i16_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, z0.h
; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %b = zext <8 x i16> %a to <8 x i64>
  store <8 x i64> %b, <8 x i64>* %out
  ret void
}

define void @zext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 {
; CHECK-LABEL: zext_v16i16_v16i64:
; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
  %a = load <16 x i16>, <16 x i16>* %in
  %b = add <16 x i16> %a, %a
  %c = zext <16 x i16> %b to <16 x i64>
  store <16 x i64> %c, <16 x i64>* %out
  ret void
}

define void @zext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 {
; CHECK-LABEL: zext_v32i16_v32i64:
; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h
; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h
; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
  %a = load <32 x i16>, <32 x i16>* %in
  %b = add <32 x i16> %a, %a
  %c = zext <32 x i16> %b to <32 x i64>
  store <32 x i64> %c, <32 x i64>* %out
  ret void
}

;
; zext i32 -> i64
;

define void @zext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) #0 {
; CHECK-LABEL: zext_v4i32_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, z0.s
; CHECK-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %b = zext <4 x i32> %a to <4 x i64>
  store <4 x i64> %b, <4 x i64>* %out
  ret void
}

define void @zext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 {
; CHECK-LABEL: zext_v8i32_v8i64:
; VBITS_GE_512: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i32>, <8 x i32>* %in
  %b = add <8 x i32> %a, %a
  %c = zext <8 x i32> %b to <8 x i64>
  store <8 x i64> %c, <8 x i64>* %out
  ret void
}

define void @zext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 {
; CHECK-LABEL: zext_v16i32_v16i64:
; VBITS_GE_1024: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
  %a = load <16 x i32>, <16 x i32>* %in
  %b = add <16 x i32> %a, %a
  %c = zext <16 x i32> %b to <16 x i64>
  store <16 x i64> %c, <16 x i64>* %out
  ret void
}

define void @zext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 {
; CHECK-LABEL: zext_v32i32_v32i64:
; VBITS_GE_2048: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s
; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
  %a = load <32 x i32>, <32 x i32>* %in
  %b = add <32 x i32> %a, %a
  %c = zext <32 x i32> %b to <32 x i64>
  store <32 x i64> %c, <32 x i64>* %out
  ret void
}

attributes #0 = { nounwind "target-features"="+sve" }