; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NOTE: The pattern must use FileCheck's {{...}} regex syntax; a literal
; "z{0-9}" can never occur in the output, which made the -NOT check vacuous.
; NO_SVE-NOT: z{{[0-9]}}

;
; truncate i16 -> i8
;

define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) #0 {
; CHECK-LABEL: trunc_v16i16_v16i8:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
; CHECK-NEXT: ret
  %a = load <16 x i16>, <16 x i16>* %in
  %b = trunc <16 x i16> %a to <16 x i8>
  ret <16 x i8> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i16_v32i8(<32 x i16>* %in, <32 x i8>* %out) #0 {
; CHECK-LABEL: trunc_v32i16_v32i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_512: add [[A_BYTES]].b, [[PG]]/m, [[A_BYTES]].b, [[A_BYTES]].b
  %a = load <32 x i16>, <32 x i16>* %in
  %b = trunc <32 x i16> %a to <32 x i8>
  %c = add <32 x i8> %b, %b
  store <32 x i8> %c, <32 x i8>* %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) #0 {
; CHECK-LABEL: trunc_v64i16_v64i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_1024: add [[A_BYTES]].b, [[PG]]/m, [[A_BYTES]].b, [[A_BYTES]].b
  %a = load <64 x i16>, <64 x i16>* %in
  %b = trunc <64 x i16> %a to <64 x i8>
  %c = add <64 x i8> %b, %b
  store <64 x i8> %c, <64 x i8>* %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) #0 {
; CHECK-LABEL: trunc_v128i16_v128i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_2048: add [[A_BYTES]].b, [[PG]]/m, [[A_BYTES]].b, [[A_BYTES]].b
  %a = load <128 x i16>, <128 x i16>* %in
  %b = trunc <128 x i16> %a to <128 x i8>
  %c = add <128 x i8> %b, %b
  store <128 x i8> %c, <128 x i8>* %out
  ret void
}

;
; truncate i32 -> i8
;

define <8 x i8> @trunc_v8i32_v8i8(<8 x i32>* %in) #0 {
; CHECK-LABEL: trunc_v8i32_v8i8:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; CHECK-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
; CHECK-NEXT: ret
  %a = load <8 x i32>, <8 x i32>* %in
  %b = trunc <8 x i32> %a to <8 x i8>
  ret <8 x i8> %b
}

define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %in) #0 {
; CHECK-LABEL: trunc_v16i32_v16i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_512-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_512-NEXT: ret
  %a = load <16 x i32>, <16 x i32>* %in
  %b = trunc <16 x i32> %a to <16 x i8>
  ret <16 x i8> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) #0 {
; CHECK-LABEL: trunc_v32i32_v32i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_1024: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_1024: add [[A_BYTES]].b, [[PG]]/m, [[A_BYTES]].b, [[A_BYTES]].b
  %a = load <32 x i32>, <32 x i32>* %in
  %b = trunc <32 x i32> %a to <32 x i8>
  %c = add <32 x i8> %b, %b
  store <32 x i8> %c, <32 x i8>* %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) #0 {
; CHECK-LABEL: trunc_v64i32_v64i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_2048: add [[A_BYTES]].b, [[PG]]/m, [[A_BYTES]].b, [[A_BYTES]].b
  %a = load <64 x i32>, <64 x i32>* %in
  %b = trunc <64 x i32> %a to <64 x i8>
  %c = add <64 x i8> %b, %b
  store <64 x i8> %c, <64 x i8>* %out
  ret void
}

;
; truncate i32 -> i16
;

define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) #0 {
; CHECK-LABEL: trunc_v8i32_v8i16:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
; CHECK-NEXT: ret
  %a = load <8 x i32>, <8 x i32>* %in
  %b = trunc <8 x i32> %a to <8 x i16>
  ret <8 x i16> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i32_v16i16(<16 x i32>* %in, <16 x i16>* %out) #0 {
; CHECK-LABEL: trunc_v16i32_v16i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_512: add [[A_HALFS]].h, [[PG]]/m, [[A_HALFS]].h, [[A_HALFS]].h
  %a = load <16 x i32>, <16 x i32>* %in
  %b = trunc <16 x i32> %a to <16 x i16>
  %c = add <16 x i16> %b, %b
  store <16 x i16> %c, <16 x i16>* %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) #0 {
; CHECK-LABEL: trunc_v32i32_v32i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_1024: add [[A_HALFS]].h, [[PG]]/m, [[A_HALFS]].h, [[A_HALFS]].h
  %a = load <32 x i32>, <32 x i32>* %in
  %b = trunc <32 x i32> %a to <32 x i16>
  %c = add <32 x i16> %b, %b
  store <32 x i16> %c, <32 x i16>* %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) #0 {
; CHECK-LABEL: trunc_v64i32_v64i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_2048: add [[A_HALFS]].h, [[PG]]/m, [[A_HALFS]].h, [[A_HALFS]].h
  %a = load <64 x i32>, <64 x i32>* %in
  %b = trunc <64 x i32> %a to <64 x i16>
  %c = add <64 x i16> %b, %b
  store <64 x i16> %c, <64 x i16>* %out
  ret void
}

;
; truncate i64 -> i8
;

; NOTE: v4i8 is not legal so result i8 elements are held within i16 containers.
define <4 x i8> @trunc_v4i64_v4i8(<4 x i64>* %in) #0 {
; CHECK-LABEL: trunc_v4i64_v4i8:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
; CHECK-NEXT: ret
  %a = load <4 x i64>, <4 x i64>* %in
  %b = trunc <4 x i64> %a to <4 x i8>
  ret <4 x i8> %b
}

define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %in) #0 {
; CHECK-LABEL: trunc_v8i64_v8i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_512-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_512-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i64>, <8 x i64>* %in
  %b = trunc <8 x i64> %a to <8 x i8>
  ret <8 x i8> %b
}

define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %in) #0 {
; CHECK-LABEL: trunc_v16i64_v16i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_1024-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_1024-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_1024-NEXT: ret
  %a = load <16 x i64>, <16 x i64>* %in
  %b = trunc <16 x i64> %a to <16 x i8>
  ret <16 x i8> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) #0 {
; CHECK-LABEL: trunc_v32i64_v32i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_2048: add [[A_BYTES]].b, [[PG]]/m, [[A_BYTES]].b, [[A_BYTES]].b
  %a = load <32 x i64>, <32 x i64>* %in
  %b = trunc <32 x i64> %a to <32 x i8>
  %c = add <32 x i8> %b, %b
  store <32 x i8> %c, <32 x i8>* %out
  ret void
}

;
; truncate i64 -> i16
;

define <4 x i16> @trunc_v4i64_v4i16(<4 x i64>* %in) #0 {
; CHECK-LABEL: trunc_v4i64_v4i16:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
; CHECK-NEXT: ret
  %a = load <4 x i64>, <4 x i64>* %in
  %b = trunc <4 x i64> %a to <4 x i16>
  ret <4 x i16> %b
}

define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %in) #0 {
; CHECK-LABEL: trunc_v8i64_v8i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_512-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i64>, <8 x i64>* %in
  %b = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) #0 {
; CHECK-LABEL: trunc_v16i64_v16i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_1024: add [[A_HALFS]].h, [[PG]]/m, [[A_HALFS]].h, [[A_HALFS]].h
  %a = load <16 x i64>, <16 x i64>* %in
  %b = trunc <16 x i64> %a to <16 x i16>
  %c = add <16 x i16> %b, %b
  store <16 x i16> %c, <16 x i16>* %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) #0 {
; CHECK-LABEL: trunc_v32i64_v32i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_2048: add [[A_HALFS]].h, [[PG]]/m, [[A_HALFS]].h, [[A_HALFS]].h
  %a = load <32 x i64>, <32 x i64>* %in
  %b = trunc <32 x i64> %a to <32 x i16>
  %c = add <32 x i16> %b, %b
  store <32 x i16> %c, <32 x i16>* %out
  ret void
}

;
; truncate i64 -> i32
;

define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) #0 {
; CHECK-LABEL: trunc_v4i64_v4i32:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: uzp1 z0.s, [[A_DWORDS]].s, [[A_DWORDS]].s
; CHECK-NEXT: ret
  %a = load <4 x i64>, <4 x i64>* %in
  %b = trunc <4 x i64> %a to <4 x i32>
  ret <4 x i32> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v8i64_v8i32(<8 x i64>* %in, <8 x i32>* %out) #0 {
; CHECK-LABEL: trunc_v8i64_v8i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_512: add [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, [[A_WORDS]].s
  %a = load <8 x i64>, <8 x i64>* %in
  %b = trunc <8 x i64> %a to <8 x i32>
  %c = add <8 x i32> %b, %b
  store <8 x i32> %c, <8 x i32>* %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) #0 {
; CHECK-LABEL: trunc_v16i64_v16i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_1024: add [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, [[A_WORDS]].s
  %a = load <16 x i64>, <16 x i64>* %in
  %b = trunc <16 x i64> %a to <16 x i32>
  %c = add <16 x i32> %b, %b
  store <16 x i32> %c, <16 x i32>* %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i64_v32i32(<32 x i64>* %in, <32 x i32>* %out) #0 {
; CHECK-LABEL: trunc_v32i64_v32i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_2048: add [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, [[A_WORDS]].s
  %a = load <32 x i64>, <32 x i64>* %in
  %b = trunc <32 x i64> %a to <32 x i32>
  %c = add <32 x i32> %b, %b
  store <32 x i32> %c, <32 x i32>* %out
  ret void
}

attributes #0 = { nounwind "target-features"="+sve" }