; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled unpacked 32-bit offsets
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; For these 2-element gathers each zero-extended 32-bit offset occupies a
; 64-bit lane, so the loads use z0.d with the uxtw extend modifier.

define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.zext
}

define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.zext
}

define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.zext
}

define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  ret <vscale x 2 x i64> %vals
}

define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
  ret <vscale x 2 x half> %vals
}

define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
  ret <vscale x 2 x float> %vals
}

define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
  ret <vscale x 2 x double> %vals
}

define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.sext
}

define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.sext
}

define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.sext
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled packed 32-bit offsets
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; For these 4-element gathers the 32-bit offsets stay packed in 32-bit lanes,
; so the loads use z0.s with the uxtw extend modifier.

define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %vals.zext = zext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %vals.zext
}

define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %vals.zext
}

define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
  %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
  ret <vscale x 4 x i32> %vals
}

define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
  %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
  ret <vscale x 4 x half> %vals
}

define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
  %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
  ret <vscale x 4 x float> %vals
}

define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %vals.sext = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %vals.sext
}

define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %vals.sext
}

declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)