; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

;
; LDFF1B, LDFF1W, LDFF1H, LDFF1D: base + 32-bit unscaled offset, sign (sxtw) or zero
; (uxtw) extended to 64 bits.
;   e.g. ldff1h { z0.d }, p0/z, [x0, z0.d, uxtw]
;

; LDFF1B
define <vscale x 4 x i32> @gldff1b_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gldff1b_s_uxtw:
; CHECK: ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @gldff1b_s_sxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gldff1b_s_sxtw:
; CHECK: ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i8(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @gldff1b_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1b_d_uxtw:
; CHECK: ldff1b { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i8(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gldff1b_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1b_d_sxtw:
; CHECK: ldff1b { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i8(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

; LDFF1H
define <vscale x 4 x i32> @gldff1h_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gldff1h_s_uxtw:
; CHECK: ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @gldff1h_s_sxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gldff1h_s_sxtw:
; CHECK: ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i16(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @gldff1h_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1h_d_uxtw:
; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gldff1h_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1h_d_sxtw:
; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

; LDFF1W
define <vscale x 4 x i32> @gldff1w_s_uxtw(<vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gldff1w_s_uxtw:
; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i32(<vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %load
}

define <vscale x 4 x i32> @gldff1w_s_sxtw(<vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gldff1w_s_sxtw:
; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i32(<vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %load
}

define <vscale x 2 x i64> @gldff1w_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1w_d_uxtw:
; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gldff1w_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1w_d_sxtw:
; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 4 x float> @gldff1w_s_uxtw_float(<vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gldff1w_s_uxtw_float:
; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4f32(<vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %b)
  ret <vscale x 4 x float> %load
}

define <vscale x 4 x float> @gldff1w_s_sxtw_float(<vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gldff1w_s_sxtw_float:
; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4f32(<vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %b)
  ret <vscale x 4 x float> %load
}

; LDFF1D
define <vscale x 2 x i64> @gldff1d_d_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1d_d_uxtw:
; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i64(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b)
  ret <vscale x 2 x i64> %load
}

define <vscale x 2 x i64> @gldff1d_d_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1d_d_sxtw:
; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i64(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b)
  ret <vscale x 2 x i64> %load
}

define <vscale x 2 x double> @gldff1d_d_uxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1d_d_uxtw_double:
; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2f64(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b)
  ret <vscale x 2 x double> %load
}

define <vscale x 2 x double> @gldff1d_d_sxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1d_d_sxtw_double:
; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2f64(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b)
  ret <vscale x 2 x double> %load
}

;
; LDFF1SB, LDFF1SW, LDFF1SH: base + 32-bit unscaled offset, sign (sxtw) or zero
; (uxtw) extended to 64 bits.
;   e.g. ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
;

; LDFF1SB
define <vscale x 4 x i32> @gldff1sb_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gldff1sb_s_uxtw:
; CHECK: ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @gldff1sb_s_sxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gldff1sb_s_sxtw:
; CHECK: ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i8(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @gldff1sb_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1sb_d_uxtw:
; CHECK: ldff1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i8(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gldff1sb_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1sb_d_sxtw:
; CHECK: ldff1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i8(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

; LDFF1SH
define <vscale x 4 x i32> @gldff1sh_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gldff1sh_s_uxtw:
; CHECK: ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @gldff1sh_s_sxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gldff1sh_s_sxtw:
; CHECK: ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i16(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @gldff1sh_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1sh_d_uxtw:
; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gldff1sh_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1sh_d_sxtw:
; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

; LDFF1SW
define <vscale x 2 x i64> @gldff1sw_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1sw_d_uxtw:
; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gldff1sw_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gldff1sw_d_sxtw:
; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

; LDFF1B/LDFF1SB
declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
declare <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i32>)
declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
declare <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i32>)

; LDFF1H/LDFF1SH
declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)

; LDFF1W/LDFF1SW
declare <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)

declare <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
declare <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)

; LDFF1D
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)

declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
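
; Note: the LDFF1* forms are SVE first-faulting loads: only the first active
; element may take a memory fault, and the first-fault register (FFR) records
; which elements were loaded successfully. The tests above exercise
; instruction selection for the base + 32-bit extended offset addressing mode
; only; FFR behaviour is not checked here.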