; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

; NOTE(review): line breaks reconstructed from a whitespace-mangled copy of this
; test; all instructions, CHECK contents and intrinsic names are kept verbatim.
; The v8i16 masked loads below call @llvm.masked.load.v8i16.p0v4i16 — the
; "p0v4i16" suffix looks like a mis-mangled pointer type (expected p0v8i16);
; verify against the file's declare lines (outside this chunk) before renaming.

; Each test masked-loads through "%x + offset" and stores the (extended) result
; to %y, checking when the offset can be folded into the predicated load
; ([r0, #imm]) versus needing a separate add/sub into a scratch register.

define i8* @ldrwu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0, #4]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %z to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %x
}

define i8* @ldrwu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #3
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %z to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %x
}

define i8* @ldrwu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #2
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %z to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %x
}

define i8* @ldrwu32_508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_508:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0, #508]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 508
  %0 = bitcast i8* %z to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %x
}

define i8* @ldrwu32_512(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_512:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #512
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 512
  %0 = bitcast i8* %z to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %x
}

define i8* @ldrwu32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_m508:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0, #-508]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -508
  %0 = bitcast i8* %z to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %x
}

define i8* @ldrwu32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_m512:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #512
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -512
  %0 = bitcast i8* %z to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %x
}

define i8* @ldrhu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0, #4]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #3
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0, #2]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhu32_254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0, #254]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 254
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhu32_256(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #256
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 256
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhu32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_m254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0, #-254]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -254
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhu32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_m256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #256
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -256
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0, #4]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #3
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0, #2]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhs32_254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0, #254]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 254
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhs32_256(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #256
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 256
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhs32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_m254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0, #-254]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -254
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhs32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_m256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #256
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -256
  %0 = bitcast i8* %z to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrhu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, #4]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %z to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %x
}

define i8* @ldrhu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #3
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %z to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %x
}

define i8* @ldrhu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, #2]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %z to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %x
}

define i8* @ldrhu16_254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, #254]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 254
  %0 = bitcast i8* %z to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %x
}

define i8* @ldrhu16_256(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #256
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 256
  %0 = bitcast i8* %z to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %x
}

define i8* @ldrhu16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_m254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, #-254]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -254
  %0 = bitcast i8* %z to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %x
}

define i8* @ldrhu16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_m256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #256
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -256
  %0 = bitcast i8* %z to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %x
}

define i8* @ldrbu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0, #4]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0, #3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0, #2]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbu32_127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0, #127]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 127
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbu32_128(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #128
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 128
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbu32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_m127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0, #-127]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -127
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbu32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_m128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #128
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -128
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0, #4]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0, #3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0, #2]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbs32_127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0, #127]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 127
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbs32_128(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #128
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 128
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbs32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_m127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0, #-127]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -127
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbs32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_m128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #128
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -128
  %0 = bitcast i8* %z to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %x
}

define i8* @ldrbu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r0, #4]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %z to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %x
}

define i8* @ldrbu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r0, #3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %z to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %x
}

define i8* @ldrbu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r0, #2]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %z to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %x
}

define i8* @ldrbu16_127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r0, #127]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 127
  %0 = bitcast i8* %z to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %x
}

define i8* @ldrbu16_128(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #128
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 128
  %0 = bitcast i8* %z to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %x
}

define i8* @ldrbu16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_m127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r0, #-127]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -127
  %0 = bitcast i8* %z to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %x
}

define i8* @ldrbu16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_m128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #128
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -128
  %0 = bitcast i8* %z to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %x
}

define i8* @ldrbs16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbs16_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.s16 q0, [r0, #4]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %z to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = sext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %x
}

define i8* @ldrbs16_3(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbs16_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.s16 q0, [r0, #3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32
3 1020 %0 = bitcast i8* %z to <8 x i8>* 1021 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1022 %c = icmp ne <8 x i16> %mask, zeroinitializer 1023 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) 1024 %2 = sext <8 x i8> %1 to <8 x i16> 1025 %3 = bitcast i8* %y to <8 x i16>* 1026 store <8 x i16> %2, <8 x i16>* %3, align 2 1027 ret i8* %x 1028} 1029 1030define i8* @ldrbs16_2(i8* %x, i8* %y, <8 x i16> *%m) { 1031; CHECK-LABEL: ldrbs16_2: 1032; CHECK: @ %bb.0: @ %entry 1033; CHECK-NEXT: vldrh.u16 q0, [r2] 1034; CHECK-NEXT: vpt.i16 ne, q0, zr 1035; CHECK-NEXT: vldrbt.s16 q0, [r0, #2] 1036; CHECK-NEXT: vstrh.16 q0, [r1] 1037; CHECK-NEXT: bx lr 1038entry: 1039 %z = getelementptr inbounds i8, i8* %x, i32 2 1040 %0 = bitcast i8* %z to <8 x i8>* 1041 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1042 %c = icmp ne <8 x i16> %mask, zeroinitializer 1043 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) 1044 %2 = sext <8 x i8> %1 to <8 x i16> 1045 %3 = bitcast i8* %y to <8 x i16>* 1046 store <8 x i16> %2, <8 x i16>* %3, align 2 1047 ret i8* %x 1048} 1049 1050define i8* @ldrbs16_127(i8* %x, i8* %y, <8 x i16> *%m) { 1051; CHECK-LABEL: ldrbs16_127: 1052; CHECK: @ %bb.0: @ %entry 1053; CHECK-NEXT: vldrh.u16 q0, [r2] 1054; CHECK-NEXT: vpt.i16 ne, q0, zr 1055; CHECK-NEXT: vldrbt.s16 q0, [r0, #127] 1056; CHECK-NEXT: vstrh.16 q0, [r1] 1057; CHECK-NEXT: bx lr 1058entry: 1059 %z = getelementptr inbounds i8, i8* %x, i32 127 1060 %0 = bitcast i8* %z to <8 x i8>* 1061 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1062 %c = icmp ne <8 x i16> %mask, zeroinitializer 1063 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) 1064 %2 = sext <8 x i8> %1 to <8 x i16> 1065 %3 = bitcast i8* %y to <8 x i16>* 1066 store <8 x i16> %2, <8 x i16>* %3, align 2 1067 ret i8* %x 1068} 1069 1070define i8* @ldrbs16_128(i8* %x, i8* %y, <8 x i16> *%m) { 1071; 
CHECK-LABEL: ldrbs16_128: 1072; CHECK: @ %bb.0: @ %entry 1073; CHECK-NEXT: vldrh.u16 q0, [r2] 1074; CHECK-NEXT: add.w r3, r0, #128 1075; CHECK-NEXT: vpt.i16 ne, q0, zr 1076; CHECK-NEXT: vldrbt.s16 q0, [r3] 1077; CHECK-NEXT: vstrh.16 q0, [r1] 1078; CHECK-NEXT: bx lr 1079entry: 1080 %z = getelementptr inbounds i8, i8* %x, i32 128 1081 %0 = bitcast i8* %z to <8 x i8>* 1082 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1083 %c = icmp ne <8 x i16> %mask, zeroinitializer 1084 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) 1085 %2 = sext <8 x i8> %1 to <8 x i16> 1086 %3 = bitcast i8* %y to <8 x i16>* 1087 store <8 x i16> %2, <8 x i16>* %3, align 2 1088 ret i8* %x 1089} 1090 1091define i8* @ldrbs16_m127(i8* %x, i8* %y, <8 x i16> *%m) { 1092; CHECK-LABEL: ldrbs16_m127: 1093; CHECK: @ %bb.0: @ %entry 1094; CHECK-NEXT: vldrh.u16 q0, [r2] 1095; CHECK-NEXT: vpt.i16 ne, q0, zr 1096; CHECK-NEXT: vldrbt.s16 q0, [r0, #-127] 1097; CHECK-NEXT: vstrh.16 q0, [r1] 1098; CHECK-NEXT: bx lr 1099entry: 1100 %z = getelementptr inbounds i8, i8* %x, i32 -127 1101 %0 = bitcast i8* %z to <8 x i8>* 1102 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1103 %c = icmp ne <8 x i16> %mask, zeroinitializer 1104 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) 1105 %2 = sext <8 x i8> %1 to <8 x i16> 1106 %3 = bitcast i8* %y to <8 x i16>* 1107 store <8 x i16> %2, <8 x i16>* %3, align 2 1108 ret i8* %x 1109} 1110 1111define i8* @ldrbs16_m128(i8* %x, i8* %y, <8 x i16> *%m) { 1112; CHECK-LABEL: ldrbs16_m128: 1113; CHECK: @ %bb.0: @ %entry 1114; CHECK-NEXT: vldrh.u16 q0, [r2] 1115; CHECK-NEXT: sub.w r3, r0, #128 1116; CHECK-NEXT: vpt.i16 ne, q0, zr 1117; CHECK-NEXT: vldrbt.s16 q0, [r3] 1118; CHECK-NEXT: vstrh.16 q0, [r1] 1119; CHECK-NEXT: bx lr 1120entry: 1121 %z = getelementptr inbounds i8, i8* %x, i32 -128 1122 %0 = bitcast i8* %z to <8 x i8>* 1123 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1124 %c = 
icmp ne <8 x i16> %mask, zeroinitializer 1125 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) 1126 %2 = sext <8 x i8> %1 to <8 x i16> 1127 %3 = bitcast i8* %y to <8 x i16>* 1128 store <8 x i16> %2, <8 x i16>* %3, align 2 1129 ret i8* %x 1130} 1131 1132define i8* @ldrbu8_4(i8* %x, i8* %y, <16 x i8> *%m) { 1133; CHECK-LABEL: ldrbu8_4: 1134; CHECK: @ %bb.0: @ %entry 1135; CHECK-NEXT: vldrb.u8 q0, [r2] 1136; CHECK-NEXT: vpt.i8 ne, q0, zr 1137; CHECK-NEXT: vldrbt.u8 q0, [r0, #4] 1138; CHECK-NEXT: vstrb.8 q0, [r1] 1139; CHECK-NEXT: bx lr 1140entry: 1141 %z = getelementptr inbounds i8, i8* %x, i32 4 1142 %0 = bitcast i8* %z to <16 x i8>* 1143 %mask = load <16 x i8>, <16 x i8>* %m, align 1 1144 %c = icmp ne <16 x i8> %mask, zeroinitializer 1145 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) 1146 %2 = bitcast i8* %y to <16 x i8>* 1147 store <16 x i8> %1, <16 x i8>* %2, align 1 1148 ret i8* %x 1149} 1150 1151define i8* @ldrbu8_3(i8* %x, i8* %y, <16 x i8> *%m) { 1152; CHECK-LABEL: ldrbu8_3: 1153; CHECK: @ %bb.0: @ %entry 1154; CHECK-NEXT: vldrb.u8 q0, [r2] 1155; CHECK-NEXT: vpt.i8 ne, q0, zr 1156; CHECK-NEXT: vldrbt.u8 q0, [r0, #3] 1157; CHECK-NEXT: vstrb.8 q0, [r1] 1158; CHECK-NEXT: bx lr 1159entry: 1160 %z = getelementptr inbounds i8, i8* %x, i32 3 1161 %0 = bitcast i8* %z to <16 x i8>* 1162 %mask = load <16 x i8>, <16 x i8>* %m, align 1 1163 %c = icmp ne <16 x i8> %mask, zeroinitializer 1164 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) 1165 %2 = bitcast i8* %y to <16 x i8>* 1166 store <16 x i8> %1, <16 x i8>* %2, align 1 1167 ret i8* %x 1168} 1169 1170define i8* @ldrbu8_2(i8* %x, i8* %y, <16 x i8> *%m) { 1171; CHECK-LABEL: ldrbu8_2: 1172; CHECK: @ %bb.0: @ %entry 1173; CHECK-NEXT: vldrb.u8 q0, [r2] 1174; CHECK-NEXT: vpt.i8 ne, q0, zr 1175; CHECK-NEXT: vldrbt.u8 q0, [r0, #2] 1176; CHECK-NEXT: vstrb.8 q0, 
[r1] 1177; CHECK-NEXT: bx lr 1178entry: 1179 %z = getelementptr inbounds i8, i8* %x, i32 2 1180 %0 = bitcast i8* %z to <16 x i8>* 1181 %mask = load <16 x i8>, <16 x i8>* %m, align 1 1182 %c = icmp ne <16 x i8> %mask, zeroinitializer 1183 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) 1184 %2 = bitcast i8* %y to <16 x i8>* 1185 store <16 x i8> %1, <16 x i8>* %2, align 1 1186 ret i8* %x 1187} 1188 1189define i8* @ldrbu8_127(i8* %x, i8* %y, <16 x i8> *%m) { 1190; CHECK-LABEL: ldrbu8_127: 1191; CHECK: @ %bb.0: @ %entry 1192; CHECK-NEXT: vldrb.u8 q0, [r2] 1193; CHECK-NEXT: vpt.i8 ne, q0, zr 1194; CHECK-NEXT: vldrbt.u8 q0, [r0, #127] 1195; CHECK-NEXT: vstrb.8 q0, [r1] 1196; CHECK-NEXT: bx lr 1197entry: 1198 %z = getelementptr inbounds i8, i8* %x, i32 127 1199 %0 = bitcast i8* %z to <16 x i8>* 1200 %mask = load <16 x i8>, <16 x i8>* %m, align 1 1201 %c = icmp ne <16 x i8> %mask, zeroinitializer 1202 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) 1203 %2 = bitcast i8* %y to <16 x i8>* 1204 store <16 x i8> %1, <16 x i8>* %2, align 1 1205 ret i8* %x 1206} 1207 1208define i8* @ldrbu8_128(i8* %x, i8* %y, <16 x i8> *%m) { 1209; CHECK-LABEL: ldrbu8_128: 1210; CHECK: @ %bb.0: @ %entry 1211; CHECK-NEXT: vldrb.u8 q0, [r2] 1212; CHECK-NEXT: add.w r3, r0, #128 1213; CHECK-NEXT: vpt.i8 ne, q0, zr 1214; CHECK-NEXT: vldrbt.u8 q0, [r3] 1215; CHECK-NEXT: vstrb.8 q0, [r1] 1216; CHECK-NEXT: bx lr 1217entry: 1218 %z = getelementptr inbounds i8, i8* %x, i32 128 1219 %0 = bitcast i8* %z to <16 x i8>* 1220 %mask = load <16 x i8>, <16 x i8>* %m, align 1 1221 %c = icmp ne <16 x i8> %mask, zeroinitializer 1222 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) 1223 %2 = bitcast i8* %y to <16 x i8>* 1224 store <16 x i8> %1, <16 x i8>* %2, align 1 1225 ret i8* %x 1226} 1227 1228define i8* @ldrbu8_m127(i8* %x, i8* %y, <16 x i8> *%m) 
{ 1229; CHECK-LABEL: ldrbu8_m127: 1230; CHECK: @ %bb.0: @ %entry 1231; CHECK-NEXT: vldrb.u8 q0, [r2] 1232; CHECK-NEXT: vpt.i8 ne, q0, zr 1233; CHECK-NEXT: vldrbt.u8 q0, [r0, #-127] 1234; CHECK-NEXT: vstrb.8 q0, [r1] 1235; CHECK-NEXT: bx lr 1236entry: 1237 %z = getelementptr inbounds i8, i8* %x, i32 -127 1238 %0 = bitcast i8* %z to <16 x i8>* 1239 %mask = load <16 x i8>, <16 x i8>* %m, align 1 1240 %c = icmp ne <16 x i8> %mask, zeroinitializer 1241 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) 1242 %2 = bitcast i8* %y to <16 x i8>* 1243 store <16 x i8> %1, <16 x i8>* %2, align 1 1244 ret i8* %x 1245} 1246 1247define i8* @ldrbu8_m128(i8* %x, i8* %y, <16 x i8> *%m) { 1248; CHECK-LABEL: ldrbu8_m128: 1249; CHECK: @ %bb.0: @ %entry 1250; CHECK-NEXT: vldrb.u8 q0, [r2] 1251; CHECK-NEXT: sub.w r3, r0, #128 1252; CHECK-NEXT: vpt.i8 ne, q0, zr 1253; CHECK-NEXT: vldrbt.u8 q0, [r3] 1254; CHECK-NEXT: vstrb.8 q0, [r1] 1255; CHECK-NEXT: bx lr 1256entry: 1257 %z = getelementptr inbounds i8, i8* %x, i32 -128 1258 %0 = bitcast i8* %z to <16 x i8>* 1259 %mask = load <16 x i8>, <16 x i8>* %m, align 1 1260 %c = icmp ne <16 x i8> %mask, zeroinitializer 1261 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) 1262 %2 = bitcast i8* %y to <16 x i8>* 1263 store <16 x i8> %1, <16 x i8>* %2, align 1 1264 ret i8* %x 1265} 1266 1267define i8* @ldrwf32_4(i8* %x, i8* %y, <4 x i32> *%m) { 1268; CHECK-LABEL: ldrwf32_4: 1269; CHECK: @ %bb.0: @ %entry 1270; CHECK-NEXT: vldrw.u32 q0, [r2] 1271; CHECK-NEXT: vpt.i32 ne, q0, zr 1272; CHECK-NEXT: vldrwt.u32 q0, [r0, #4] 1273; CHECK-NEXT: vstrw.32 q0, [r1] 1274; CHECK-NEXT: bx lr 1275entry: 1276 %z = getelementptr inbounds i8, i8* %x, i32 4 1277 %0 = bitcast i8* %z to <4 x float>* 1278 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1279 %c = icmp ne <4 x i32> %mask, zeroinitializer 1280 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x 
float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) 1281 %2 = bitcast i8* %y to <4 x float>* 1282 store <4 x float> %1, <4 x float>* %2, align 4 1283 ret i8* %x 1284} 1285 1286define i8* @ldrwf32_3(i8* %x, i8* %y, <4 x i32> *%m) { 1287; CHECK-LABEL: ldrwf32_3: 1288; CHECK: @ %bb.0: @ %entry 1289; CHECK-NEXT: vldrw.u32 q0, [r2] 1290; CHECK-NEXT: adds r3, r0, #3 1291; CHECK-NEXT: vpt.i32 ne, q0, zr 1292; CHECK-NEXT: vldrwt.u32 q0, [r3] 1293; CHECK-NEXT: vstrw.32 q0, [r1] 1294; CHECK-NEXT: bx lr 1295entry: 1296 %z = getelementptr inbounds i8, i8* %x, i32 3 1297 %0 = bitcast i8* %z to <4 x float>* 1298 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1299 %c = icmp ne <4 x i32> %mask, zeroinitializer 1300 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) 1301 %2 = bitcast i8* %y to <4 x float>* 1302 store <4 x float> %1, <4 x float>* %2, align 4 1303 ret i8* %x 1304} 1305 1306define i8* @ldrwf32_2(i8* %x, i8* %y, <4 x i32> *%m) { 1307; CHECK-LABEL: ldrwf32_2: 1308; CHECK: @ %bb.0: @ %entry 1309; CHECK-NEXT: vldrw.u32 q0, [r2] 1310; CHECK-NEXT: adds r3, r0, #2 1311; CHECK-NEXT: vpt.i32 ne, q0, zr 1312; CHECK-NEXT: vldrwt.u32 q0, [r3] 1313; CHECK-NEXT: vstrw.32 q0, [r1] 1314; CHECK-NEXT: bx lr 1315entry: 1316 %z = getelementptr inbounds i8, i8* %x, i32 2 1317 %0 = bitcast i8* %z to <4 x float>* 1318 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1319 %c = icmp ne <4 x i32> %mask, zeroinitializer 1320 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) 1321 %2 = bitcast i8* %y to <4 x float>* 1322 store <4 x float> %1, <4 x float>* %2, align 4 1323 ret i8* %x 1324} 1325 1326define i8* @ldrwf32_508(i8* %x, i8* %y, <4 x i32> *%m) { 1327; CHECK-LABEL: ldrwf32_508: 1328; CHECK: @ %bb.0: @ %entry 1329; CHECK-NEXT: vldrw.u32 q0, [r2] 1330; CHECK-NEXT: vpt.i32 ne, q0, zr 1331; CHECK-NEXT: vldrwt.u32 q0, [r0, #508] 1332; CHECK-NEXT: vstrw.32 q0, [r1] 1333; 
CHECK-NEXT: bx lr 1334entry: 1335 %z = getelementptr inbounds i8, i8* %x, i32 508 1336 %0 = bitcast i8* %z to <4 x float>* 1337 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1338 %c = icmp ne <4 x i32> %mask, zeroinitializer 1339 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) 1340 %2 = bitcast i8* %y to <4 x float>* 1341 store <4 x float> %1, <4 x float>* %2, align 4 1342 ret i8* %x 1343} 1344 1345define i8* @ldrwf32_512(i8* %x, i8* %y, <4 x i32> *%m) { 1346; CHECK-LABEL: ldrwf32_512: 1347; CHECK: @ %bb.0: @ %entry 1348; CHECK-NEXT: vldrw.u32 q0, [r2] 1349; CHECK-NEXT: add.w r3, r0, #512 1350; CHECK-NEXT: vpt.i32 ne, q0, zr 1351; CHECK-NEXT: vldrwt.u32 q0, [r3] 1352; CHECK-NEXT: vstrw.32 q0, [r1] 1353; CHECK-NEXT: bx lr 1354entry: 1355 %z = getelementptr inbounds i8, i8* %x, i32 512 1356 %0 = bitcast i8* %z to <4 x float>* 1357 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1358 %c = icmp ne <4 x i32> %mask, zeroinitializer 1359 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) 1360 %2 = bitcast i8* %y to <4 x float>* 1361 store <4 x float> %1, <4 x float>* %2, align 4 1362 ret i8* %x 1363} 1364 1365define i8* @ldrwf32_m508(i8* %x, i8* %y, <4 x i32> *%m) { 1366; CHECK-LABEL: ldrwf32_m508: 1367; CHECK: @ %bb.0: @ %entry 1368; CHECK-NEXT: vldrw.u32 q0, [r2] 1369; CHECK-NEXT: vpt.i32 ne, q0, zr 1370; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508] 1371; CHECK-NEXT: vstrw.32 q0, [r1] 1372; CHECK-NEXT: bx lr 1373entry: 1374 %z = getelementptr inbounds i8, i8* %x, i32 -508 1375 %0 = bitcast i8* %z to <4 x float>* 1376 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1377 %c = icmp ne <4 x i32> %mask, zeroinitializer 1378 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) 1379 %2 = bitcast i8* %y to <4 x float>* 1380 store <4 x float> %1, <4 x float>* %2, align 4 1381 ret i8* %x 1382} 1383 1384define 
i8* @ldrwf32_m512(i8* %x, i8* %y, <4 x i32> *%m) { 1385; CHECK-LABEL: ldrwf32_m512: 1386; CHECK: @ %bb.0: @ %entry 1387; CHECK-NEXT: vldrw.u32 q0, [r2] 1388; CHECK-NEXT: sub.w r3, r0, #512 1389; CHECK-NEXT: vpt.i32 ne, q0, zr 1390; CHECK-NEXT: vldrwt.u32 q0, [r3] 1391; CHECK-NEXT: vstrw.32 q0, [r1] 1392; CHECK-NEXT: bx lr 1393entry: 1394 %z = getelementptr inbounds i8, i8* %x, i32 -512 1395 %0 = bitcast i8* %z to <4 x float>* 1396 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1397 %c = icmp ne <4 x i32> %mask, zeroinitializer 1398 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) 1399 %2 = bitcast i8* %y to <4 x float>* 1400 store <4 x float> %1, <4 x float>* %2, align 4 1401 ret i8* %x 1402} 1403 1404define i8* @ldrhf16_4(i8* %x, i8* %y, <8 x i16> *%m) { 1405; CHECK-LABEL: ldrhf16_4: 1406; CHECK: @ %bb.0: @ %entry 1407; CHECK-NEXT: vldrh.u16 q0, [r2] 1408; CHECK-NEXT: vpt.i16 ne, q0, zr 1409; CHECK-NEXT: vldrht.u16 q0, [r0, #4] 1410; CHECK-NEXT: vstrh.16 q0, [r1] 1411; CHECK-NEXT: bx lr 1412entry: 1413 %z = getelementptr inbounds i8, i8* %x, i32 4 1414 %0 = bitcast i8* %z to <8 x half>* 1415 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1416 %c = icmp ne <8 x i16> %mask, zeroinitializer 1417 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) 1418 %2 = bitcast i8* %y to <8 x half>* 1419 store <8 x half> %1, <8 x half>* %2, align 2 1420 ret i8* %x 1421} 1422 1423define i8* @ldrhf16_3(i8* %x, i8* %y, <8 x i16> *%m) { 1424; CHECK-LABEL: ldrhf16_3: 1425; CHECK: @ %bb.0: @ %entry 1426; CHECK-NEXT: vldrh.u16 q0, [r2] 1427; CHECK-NEXT: adds r3, r0, #3 1428; CHECK-NEXT: vpt.i16 ne, q0, zr 1429; CHECK-NEXT: vldrht.u16 q0, [r3] 1430; CHECK-NEXT: vstrh.16 q0, [r1] 1431; CHECK-NEXT: bx lr 1432entry: 1433 %z = getelementptr inbounds i8, i8* %x, i32 3 1434 %0 = bitcast i8* %z to <8 x half>* 1435 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1436 %c = icmp ne <8 
x i16> %mask, zeroinitializer 1437 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) 1438 %2 = bitcast i8* %y to <8 x half>* 1439 store <8 x half> %1, <8 x half>* %2, align 2 1440 ret i8* %x 1441} 1442 1443define i8* @ldrhf16_2(i8* %x, i8* %y, <8 x i16> *%m) { 1444; CHECK-LABEL: ldrhf16_2: 1445; CHECK: @ %bb.0: @ %entry 1446; CHECK-NEXT: vldrh.u16 q0, [r2] 1447; CHECK-NEXT: vpt.i16 ne, q0, zr 1448; CHECK-NEXT: vldrht.u16 q0, [r0, #2] 1449; CHECK-NEXT: vstrh.16 q0, [r1] 1450; CHECK-NEXT: bx lr 1451entry: 1452 %z = getelementptr inbounds i8, i8* %x, i32 2 1453 %0 = bitcast i8* %z to <8 x half>* 1454 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1455 %c = icmp ne <8 x i16> %mask, zeroinitializer 1456 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) 1457 %2 = bitcast i8* %y to <8 x half>* 1458 store <8 x half> %1, <8 x half>* %2, align 2 1459 ret i8* %x 1460} 1461 1462define i8* @ldrhf16_254(i8* %x, i8* %y, <8 x i16> *%m) { 1463; CHECK-LABEL: ldrhf16_254: 1464; CHECK: @ %bb.0: @ %entry 1465; CHECK-NEXT: vldrh.u16 q0, [r2] 1466; CHECK-NEXT: vpt.i16 ne, q0, zr 1467; CHECK-NEXT: vldrht.u16 q0, [r0, #254] 1468; CHECK-NEXT: vstrh.16 q0, [r1] 1469; CHECK-NEXT: bx lr 1470entry: 1471 %z = getelementptr inbounds i8, i8* %x, i32 254 1472 %0 = bitcast i8* %z to <8 x half>* 1473 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1474 %c = icmp ne <8 x i16> %mask, zeroinitializer 1475 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) 1476 %2 = bitcast i8* %y to <8 x half>* 1477 store <8 x half> %1, <8 x half>* %2, align 2 1478 ret i8* %x 1479} 1480 1481define i8* @ldrhf16_256(i8* %x, i8* %y, <8 x i16> *%m) { 1482; CHECK-LABEL: ldrhf16_256: 1483; CHECK: @ %bb.0: @ %entry 1484; CHECK-NEXT: vldrh.u16 q0, [r2] 1485; CHECK-NEXT: add.w r3, r0, #256 1486; CHECK-NEXT: vpt.i16 ne, q0, zr 1487; CHECK-NEXT: vldrht.u16 q0, [r3] 
1488; CHECK-NEXT: vstrh.16 q0, [r1] 1489; CHECK-NEXT: bx lr 1490entry: 1491 %z = getelementptr inbounds i8, i8* %x, i32 256 1492 %0 = bitcast i8* %z to <8 x half>* 1493 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1494 %c = icmp ne <8 x i16> %mask, zeroinitializer 1495 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) 1496 %2 = bitcast i8* %y to <8 x half>* 1497 store <8 x half> %1, <8 x half>* %2, align 2 1498 ret i8* %x 1499} 1500 1501define i8* @ldrhf16_m254(i8* %x, i8* %y, <8 x i16> *%m) { 1502; CHECK-LABEL: ldrhf16_m254: 1503; CHECK: @ %bb.0: @ %entry 1504; CHECK-NEXT: vldrh.u16 q0, [r2] 1505; CHECK-NEXT: vpt.i16 ne, q0, zr 1506; CHECK-NEXT: vldrht.u16 q0, [r0, #-254] 1507; CHECK-NEXT: vstrh.16 q0, [r1] 1508; CHECK-NEXT: bx lr 1509entry: 1510 %z = getelementptr inbounds i8, i8* %x, i32 -254 1511 %0 = bitcast i8* %z to <8 x half>* 1512 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1513 %c = icmp ne <8 x i16> %mask, zeroinitializer 1514 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) 1515 %2 = bitcast i8* %y to <8 x half>* 1516 store <8 x half> %1, <8 x half>* %2, align 2 1517 ret i8* %x 1518} 1519 1520define i8* @ldrhf16_m256(i8* %x, i8* %y, <8 x i16> *%m) { 1521; CHECK-LABEL: ldrhf16_m256: 1522; CHECK: @ %bb.0: @ %entry 1523; CHECK-NEXT: vldrh.u16 q0, [r2] 1524; CHECK-NEXT: sub.w r3, r0, #256 1525; CHECK-NEXT: vpt.i16 ne, q0, zr 1526; CHECK-NEXT: vldrht.u16 q0, [r3] 1527; CHECK-NEXT: vstrh.16 q0, [r1] 1528; CHECK-NEXT: bx lr 1529entry: 1530 %z = getelementptr inbounds i8, i8* %x, i32 -256 1531 %0 = bitcast i8* %z to <8 x half>* 1532 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1533 %c = icmp ne <8 x i16> %mask, zeroinitializer 1534 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) 1535 %2 = bitcast i8* %y to <8 x half>* 1536 store <8 x half> %1, <8 x half>* %2, align 2 1537 ret i8* %x 
1538} 1539 1540 1541 1542 1543define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) { 1544; CHECK-LABEL: strw32_4: 1545; CHECK: @ %bb.0: @ %entry 1546; CHECK-NEXT: vldrw.u32 q0, [r1] 1547; CHECK-NEXT: vldrw.u32 q1, [r2] 1548; CHECK-NEXT: vpt.i32 ne, q1, zr 1549; CHECK-NEXT: vstrwt.32 q0, [r0, #4] 1550; CHECK-NEXT: bx lr 1551entry: 1552 %z = getelementptr inbounds i8, i8* %y, i32 4 1553 %0 = bitcast i8* %x to <4 x i32>* 1554 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1555 %c = icmp ne <4 x i32> %mask, zeroinitializer 1556 %1 = load <4 x i32>, <4 x i32>* %0, align 4 1557 %2 = bitcast i8* %z to <4 x i32>* 1558 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) 1559 ret i8* %y 1560} 1561 1562define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) { 1563; CHECK-LABEL: strw32_3: 1564; CHECK: @ %bb.0: @ %entry 1565; CHECK-NEXT: vldrw.u32 q0, [r1] 1566; CHECK-NEXT: adds r1, r0, #3 1567; CHECK-NEXT: vldrw.u32 q1, [r2] 1568; CHECK-NEXT: vpt.i32 ne, q1, zr 1569; CHECK-NEXT: vstrwt.32 q0, [r1] 1570; CHECK-NEXT: bx lr 1571entry: 1572 %z = getelementptr inbounds i8, i8* %y, i32 3 1573 %0 = bitcast i8* %x to <4 x i32>* 1574 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1575 %c = icmp ne <4 x i32> %mask, zeroinitializer 1576 %1 = load <4 x i32>, <4 x i32>* %0, align 4 1577 %2 = bitcast i8* %z to <4 x i32>* 1578 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) 1579 ret i8* %y 1580} 1581 1582define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) { 1583; CHECK-LABEL: strw32_2: 1584; CHECK: @ %bb.0: @ %entry 1585; CHECK-NEXT: vldrw.u32 q0, [r1] 1586; CHECK-NEXT: adds r1, r0, #2 1587; CHECK-NEXT: vldrw.u32 q1, [r2] 1588; CHECK-NEXT: vpt.i32 ne, q1, zr 1589; CHECK-NEXT: vstrwt.32 q0, [r1] 1590; CHECK-NEXT: bx lr 1591entry: 1592 %z = getelementptr inbounds i8, i8* %y, i32 2 1593 %0 = bitcast i8* %x to <4 x i32>* 1594 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1595 %c = icmp ne <4 x i32> %mask, 
zeroinitializer 1596 %1 = load <4 x i32>, <4 x i32>* %0, align 4 1597 %2 = bitcast i8* %z to <4 x i32>* 1598 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) 1599 ret i8* %y 1600} 1601 1602define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) { 1603; CHECK-LABEL: strw32_508: 1604; CHECK: @ %bb.0: @ %entry 1605; CHECK-NEXT: vldrw.u32 q0, [r1] 1606; CHECK-NEXT: vldrw.u32 q1, [r2] 1607; CHECK-NEXT: vpt.i32 ne, q1, zr 1608; CHECK-NEXT: vstrwt.32 q0, [r0, #508] 1609; CHECK-NEXT: bx lr 1610entry: 1611 %z = getelementptr inbounds i8, i8* %y, i32 508 1612 %0 = bitcast i8* %x to <4 x i32>* 1613 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1614 %c = icmp ne <4 x i32> %mask, zeroinitializer 1615 %1 = load <4 x i32>, <4 x i32>* %0, align 4 1616 %2 = bitcast i8* %z to <4 x i32>* 1617 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) 1618 ret i8* %y 1619} 1620 1621define i8* @strw32_512(i8* %y, i8* %x, <4 x i32> *%m) { 1622; CHECK-LABEL: strw32_512: 1623; CHECK: @ %bb.0: @ %entry 1624; CHECK-NEXT: vldrw.u32 q0, [r1] 1625; CHECK-NEXT: add.w r1, r0, #512 1626; CHECK-NEXT: vldrw.u32 q1, [r2] 1627; CHECK-NEXT: vpt.i32 ne, q1, zr 1628; CHECK-NEXT: vstrwt.32 q0, [r1] 1629; CHECK-NEXT: bx lr 1630entry: 1631 %z = getelementptr inbounds i8, i8* %y, i32 512 1632 %0 = bitcast i8* %x to <4 x i32>* 1633 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1634 %c = icmp ne <4 x i32> %mask, zeroinitializer 1635 %1 = load <4 x i32>, <4 x i32>* %0, align 4 1636 %2 = bitcast i8* %z to <4 x i32>* 1637 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) 1638 ret i8* %y 1639} 1640 1641define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) { 1642; CHECK-LABEL: strw32_m508: 1643; CHECK: @ %bb.0: @ %entry 1644; CHECK-NEXT: vldrw.u32 q0, [r1] 1645; CHECK-NEXT: vldrw.u32 q1, [r2] 1646; CHECK-NEXT: vpt.i32 ne, q1, zr 1647; CHECK-NEXT: vstrwt.32 q0, [r0, #-508] 1648; CHECK-NEXT: bx lr 
1649entry: 1650 %z = getelementptr inbounds i8, i8* %y, i32 -508 1651 %0 = bitcast i8* %x to <4 x i32>* 1652 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1653 %c = icmp ne <4 x i32> %mask, zeroinitializer 1654 %1 = load <4 x i32>, <4 x i32>* %0, align 4 1655 %2 = bitcast i8* %z to <4 x i32>* 1656 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) 1657 ret i8* %y 1658} 1659 1660define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) { 1661; CHECK-LABEL: strw32_m512: 1662; CHECK: @ %bb.0: @ %entry 1663; CHECK-NEXT: vldrw.u32 q0, [r1] 1664; CHECK-NEXT: sub.w r1, r0, #512 1665; CHECK-NEXT: vldrw.u32 q1, [r2] 1666; CHECK-NEXT: vpt.i32 ne, q1, zr 1667; CHECK-NEXT: vstrwt.32 q0, [r1] 1668; CHECK-NEXT: bx lr 1669entry: 1670 %z = getelementptr inbounds i8, i8* %y, i32 -512 1671 %0 = bitcast i8* %x to <4 x i32>* 1672 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1673 %c = icmp ne <4 x i32> %mask, zeroinitializer 1674 %1 = load <4 x i32>, <4 x i32>* %0, align 4 1675 %2 = bitcast i8* %z to <4 x i32>* 1676 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) 1677 ret i8* %y 1678} 1679 1680define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) { 1681; CHECK-LABEL: strh32_4: 1682; CHECK: @ %bb.0: @ %entry 1683; CHECK-NEXT: vldrh.u32 q0, [r1] 1684; CHECK-NEXT: vldrw.u32 q1, [r2] 1685; CHECK-NEXT: vpt.i32 ne, q1, zr 1686; CHECK-NEXT: vstrht.32 q0, [r0, #4] 1687; CHECK-NEXT: bx lr 1688entry: 1689 %z = getelementptr inbounds i8, i8* %y, i32 4 1690 %0 = bitcast i8* %x to <4 x i16>* 1691 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1692 %c = icmp ne <4 x i32> %mask, zeroinitializer 1693 %1 = load <4 x i16>, <4 x i16>* %0, align 2 1694 %2 = bitcast i8* %z to <4 x i16>* 1695 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) 1696 ret i8* %y 1697} 1698 1699define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) { 1700; CHECK-LABEL: strh32_3: 1701; CHECK: @ %bb.0: @ %entry 
1702; CHECK-NEXT: vldrh.u32 q0, [r1] 1703; CHECK-NEXT: adds r1, r0, #3 1704; CHECK-NEXT: vldrw.u32 q1, [r2] 1705; CHECK-NEXT: vpt.i32 ne, q1, zr 1706; CHECK-NEXT: vstrht.32 q0, [r1] 1707; CHECK-NEXT: bx lr 1708entry: 1709 %z = getelementptr inbounds i8, i8* %y, i32 3 1710 %0 = bitcast i8* %x to <4 x i16>* 1711 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1712 %c = icmp ne <4 x i32> %mask, zeroinitializer 1713 %1 = load <4 x i16>, <4 x i16>* %0, align 2 1714 %2 = bitcast i8* %z to <4 x i16>* 1715 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) 1716 ret i8* %y 1717} 1718 1719define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) { 1720; CHECK-LABEL: strh32_2: 1721; CHECK: @ %bb.0: @ %entry 1722; CHECK-NEXT: vldrh.u32 q0, [r1] 1723; CHECK-NEXT: vldrw.u32 q1, [r2] 1724; CHECK-NEXT: vpt.i32 ne, q1, zr 1725; CHECK-NEXT: vstrht.32 q0, [r0, #2] 1726; CHECK-NEXT: bx lr 1727entry: 1728 %z = getelementptr inbounds i8, i8* %y, i32 2 1729 %0 = bitcast i8* %x to <4 x i16>* 1730 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1731 %c = icmp ne <4 x i32> %mask, zeroinitializer 1732 %1 = load <4 x i16>, <4 x i16>* %0, align 2 1733 %2 = bitcast i8* %z to <4 x i16>* 1734 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) 1735 ret i8* %y 1736} 1737 1738define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) { 1739; CHECK-LABEL: strh32_254: 1740; CHECK: @ %bb.0: @ %entry 1741; CHECK-NEXT: vldrh.u32 q0, [r1] 1742; CHECK-NEXT: vldrw.u32 q1, [r2] 1743; CHECK-NEXT: vpt.i32 ne, q1, zr 1744; CHECK-NEXT: vstrht.32 q0, [r0, #254] 1745; CHECK-NEXT: bx lr 1746entry: 1747 %z = getelementptr inbounds i8, i8* %y, i32 254 1748 %0 = bitcast i8* %x to <4 x i16>* 1749 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1750 %c = icmp ne <4 x i32> %mask, zeroinitializer 1751 %1 = load <4 x i16>, <4 x i16>* %0, align 2 1752 %2 = bitcast i8* %z to <4 x i16>* 1753 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* 
%2, i32 2, <4 x i1> %c) 1754 ret i8* %y 1755} 1756 1757define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) { 1758; CHECK-LABEL: strh32_256: 1759; CHECK: @ %bb.0: @ %entry 1760; CHECK-NEXT: vldrh.u32 q0, [r1] 1761; CHECK-NEXT: add.w r1, r0, #256 1762; CHECK-NEXT: vldrw.u32 q1, [r2] 1763; CHECK-NEXT: vpt.i32 ne, q1, zr 1764; CHECK-NEXT: vstrht.32 q0, [r1] 1765; CHECK-NEXT: bx lr 1766entry: 1767 %z = getelementptr inbounds i8, i8* %y, i32 256 1768 %0 = bitcast i8* %x to <4 x i16>* 1769 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1770 %c = icmp ne <4 x i32> %mask, zeroinitializer 1771 %1 = load <4 x i16>, <4 x i16>* %0, align 2 1772 %2 = bitcast i8* %z to <4 x i16>* 1773 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) 1774 ret i8* %y 1775} 1776 1777define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) { 1778; CHECK-LABEL: strh32_m254: 1779; CHECK: @ %bb.0: @ %entry 1780; CHECK-NEXT: vldrh.u32 q0, [r1] 1781; CHECK-NEXT: vldrw.u32 q1, [r2] 1782; CHECK-NEXT: vpt.i32 ne, q1, zr 1783; CHECK-NEXT: vstrht.32 q0, [r0, #-254] 1784; CHECK-NEXT: bx lr 1785entry: 1786 %z = getelementptr inbounds i8, i8* %y, i32 -254 1787 %0 = bitcast i8* %x to <4 x i16>* 1788 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1789 %c = icmp ne <4 x i32> %mask, zeroinitializer 1790 %1 = load <4 x i16>, <4 x i16>* %0, align 2 1791 %2 = bitcast i8* %z to <4 x i16>* 1792 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) 1793 ret i8* %y 1794} 1795 1796define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) { 1797; CHECK-LABEL: strh32_m256: 1798; CHECK: @ %bb.0: @ %entry 1799; CHECK-NEXT: vldrh.u32 q0, [r1] 1800; CHECK-NEXT: sub.w r1, r0, #256 1801; CHECK-NEXT: vldrw.u32 q1, [r2] 1802; CHECK-NEXT: vpt.i32 ne, q1, zr 1803; CHECK-NEXT: vstrht.32 q0, [r1] 1804; CHECK-NEXT: bx lr 1805entry: 1806 %z = getelementptr inbounds i8, i8* %y, i32 -256 1807 %0 = bitcast i8* %x to <4 x i16>* 1808 %mask = load <4 x i32>, <4 x i32>* 
%m, align 4 1809 %c = icmp ne <4 x i32> %mask, zeroinitializer 1810 %1 = load <4 x i16>, <4 x i16>* %0, align 2 1811 %2 = bitcast i8* %z to <4 x i16>* 1812 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) 1813 ret i8* %y 1814} 1815 1816define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) { 1817; CHECK-LABEL: strh16_4: 1818; CHECK: @ %bb.0: @ %entry 1819; CHECK-NEXT: vldrh.u16 q0, [r1] 1820; CHECK-NEXT: vldrh.u16 q1, [r2] 1821; CHECK-NEXT: vpt.i16 ne, q1, zr 1822; CHECK-NEXT: vstrht.16 q0, [r0, #4] 1823; CHECK-NEXT: bx lr 1824entry: 1825 %z = getelementptr inbounds i8, i8* %y, i32 4 1826 %0 = bitcast i8* %x to <8 x i16>* 1827 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1828 %c = icmp ne <8 x i16> %mask, zeroinitializer 1829 %1 = load <8 x i16>, <8 x i16>* %0, align 2 1830 %2 = bitcast i8* %z to <8 x i16>* 1831 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) 1832 ret i8* %y 1833} 1834 1835define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) { 1836; CHECK-LABEL: strh16_3: 1837; CHECK: @ %bb.0: @ %entry 1838; CHECK-NEXT: vldrh.u16 q0, [r1] 1839; CHECK-NEXT: adds r1, r0, #3 1840; CHECK-NEXT: vldrh.u16 q1, [r2] 1841; CHECK-NEXT: vpt.i16 ne, q1, zr 1842; CHECK-NEXT: vstrht.16 q0, [r1] 1843; CHECK-NEXT: bx lr 1844entry: 1845 %z = getelementptr inbounds i8, i8* %y, i32 3 1846 %0 = bitcast i8* %x to <8 x i16>* 1847 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1848 %c = icmp ne <8 x i16> %mask, zeroinitializer 1849 %1 = load <8 x i16>, <8 x i16>* %0, align 2 1850 %2 = bitcast i8* %z to <8 x i16>* 1851 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) 1852 ret i8* %y 1853} 1854 1855define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) { 1856; CHECK-LABEL: strh16_2: 1857; CHECK: @ %bb.0: @ %entry 1858; CHECK-NEXT: vldrh.u16 q0, [r1] 1859; CHECK-NEXT: vldrh.u16 q1, [r2] 1860; CHECK-NEXT: vpt.i16 ne, q1, zr 1861; CHECK-NEXT: vstrht.16 q0, [r0, #2] 
1862; CHECK-NEXT: bx lr 1863entry: 1864 %z = getelementptr inbounds i8, i8* %y, i32 2 1865 %0 = bitcast i8* %x to <8 x i16>* 1866 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1867 %c = icmp ne <8 x i16> %mask, zeroinitializer 1868 %1 = load <8 x i16>, <8 x i16>* %0, align 2 1869 %2 = bitcast i8* %z to <8 x i16>* 1870 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) 1871 ret i8* %y 1872} 1873 1874define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) { 1875; CHECK-LABEL: strh16_254: 1876; CHECK: @ %bb.0: @ %entry 1877; CHECK-NEXT: vldrh.u16 q0, [r1] 1878; CHECK-NEXT: vldrh.u16 q1, [r2] 1879; CHECK-NEXT: vpt.i16 ne, q1, zr 1880; CHECK-NEXT: vstrht.16 q0, [r0, #254] 1881; CHECK-NEXT: bx lr 1882entry: 1883 %z = getelementptr inbounds i8, i8* %y, i32 254 1884 %0 = bitcast i8* %x to <8 x i16>* 1885 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1886 %c = icmp ne <8 x i16> %mask, zeroinitializer 1887 %1 = load <8 x i16>, <8 x i16>* %0, align 2 1888 %2 = bitcast i8* %z to <8 x i16>* 1889 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) 1890 ret i8* %y 1891} 1892 1893define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) { 1894; CHECK-LABEL: strh16_256: 1895; CHECK: @ %bb.0: @ %entry 1896; CHECK-NEXT: vldrh.u16 q0, [r1] 1897; CHECK-NEXT: add.w r1, r0, #256 1898; CHECK-NEXT: vldrh.u16 q1, [r2] 1899; CHECK-NEXT: vpt.i16 ne, q1, zr 1900; CHECK-NEXT: vstrht.16 q0, [r1] 1901; CHECK-NEXT: bx lr 1902entry: 1903 %z = getelementptr inbounds i8, i8* %y, i32 256 1904 %0 = bitcast i8* %x to <8 x i16>* 1905 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1906 %c = icmp ne <8 x i16> %mask, zeroinitializer 1907 %1 = load <8 x i16>, <8 x i16>* %0, align 2 1908 %2 = bitcast i8* %z to <8 x i16>* 1909 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) 1910 ret i8* %y 1911} 1912 1913define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) { 1914; CHECK-LABEL: strh16_m254: 
1915; CHECK: @ %bb.0: @ %entry 1916; CHECK-NEXT: vldrh.u16 q0, [r1] 1917; CHECK-NEXT: vldrh.u16 q1, [r2] 1918; CHECK-NEXT: vpt.i16 ne, q1, zr 1919; CHECK-NEXT: vstrht.16 q0, [r0, #-254] 1920; CHECK-NEXT: bx lr 1921entry: 1922 %z = getelementptr inbounds i8, i8* %y, i32 -254 1923 %0 = bitcast i8* %x to <8 x i16>* 1924 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1925 %c = icmp ne <8 x i16> %mask, zeroinitializer 1926 %1 = load <8 x i16>, <8 x i16>* %0, align 2 1927 %2 = bitcast i8* %z to <8 x i16>* 1928 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) 1929 ret i8* %y 1930} 1931 1932define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) { 1933; CHECK-LABEL: strh16_m256: 1934; CHECK: @ %bb.0: @ %entry 1935; CHECK-NEXT: vldrh.u16 q0, [r1] 1936; CHECK-NEXT: sub.w r1, r0, #256 1937; CHECK-NEXT: vldrh.u16 q1, [r2] 1938; CHECK-NEXT: vpt.i16 ne, q1, zr 1939; CHECK-NEXT: vstrht.16 q0, [r1] 1940; CHECK-NEXT: bx lr 1941entry: 1942 %z = getelementptr inbounds i8, i8* %y, i32 -256 1943 %0 = bitcast i8* %x to <8 x i16>* 1944 %mask = load <8 x i16>, <8 x i16>* %m, align 2 1945 %c = icmp ne <8 x i16> %mask, zeroinitializer 1946 %1 = load <8 x i16>, <8 x i16>* %0, align 2 1947 %2 = bitcast i8* %z to <8 x i16>* 1948 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) 1949 ret i8* %y 1950} 1951 1952define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) { 1953; CHECK-LABEL: strb32_4: 1954; CHECK: @ %bb.0: @ %entry 1955; CHECK-NEXT: vldrb.u32 q0, [r1] 1956; CHECK-NEXT: vldrw.u32 q1, [r2] 1957; CHECK-NEXT: vpt.i32 ne, q1, zr 1958; CHECK-NEXT: vstrbt.32 q0, [r0, #4] 1959; CHECK-NEXT: bx lr 1960entry: 1961 %z = getelementptr inbounds i8, i8* %y, i32 4 1962 %0 = bitcast i8* %x to <4 x i8>* 1963 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1964 %c = icmp ne <4 x i32> %mask, zeroinitializer 1965 %1 = load <4 x i8>, <4 x i8>* %0, align 1 1966 %2 = bitcast i8* %z to <4 x i8>* 1967 call void 
@llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) 1968 ret i8* %y 1969} 1970 1971define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) { 1972; CHECK-LABEL: strb32_3: 1973; CHECK: @ %bb.0: @ %entry 1974; CHECK-NEXT: vldrb.u32 q0, [r1] 1975; CHECK-NEXT: vldrw.u32 q1, [r2] 1976; CHECK-NEXT: vpt.i32 ne, q1, zr 1977; CHECK-NEXT: vstrbt.32 q0, [r0, #3] 1978; CHECK-NEXT: bx lr 1979entry: 1980 %z = getelementptr inbounds i8, i8* %y, i32 3 1981 %0 = bitcast i8* %x to <4 x i8>* 1982 %mask = load <4 x i32>, <4 x i32>* %m, align 4 1983 %c = icmp ne <4 x i32> %mask, zeroinitializer 1984 %1 = load <4 x i8>, <4 x i8>* %0, align 1 1985 %2 = bitcast i8* %z to <4 x i8>* 1986 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) 1987 ret i8* %y 1988} 1989 1990define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) { 1991; CHECK-LABEL: strb32_2: 1992; CHECK: @ %bb.0: @ %entry 1993; CHECK-NEXT: vldrb.u32 q0, [r1] 1994; CHECK-NEXT: vldrw.u32 q1, [r2] 1995; CHECK-NEXT: vpt.i32 ne, q1, zr 1996; CHECK-NEXT: vstrbt.32 q0, [r0, #2] 1997; CHECK-NEXT: bx lr 1998entry: 1999 %z = getelementptr inbounds i8, i8* %y, i32 2 2000 %0 = bitcast i8* %x to <4 x i8>* 2001 %mask = load <4 x i32>, <4 x i32>* %m, align 4 2002 %c = icmp ne <4 x i32> %mask, zeroinitializer 2003 %1 = load <4 x i8>, <4 x i8>* %0, align 1 2004 %2 = bitcast i8* %z to <4 x i8>* 2005 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) 2006 ret i8* %y 2007} 2008 2009define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) { 2010; CHECK-LABEL: strb32_127: 2011; CHECK: @ %bb.0: @ %entry 2012; CHECK-NEXT: vldrb.u32 q0, [r1] 2013; CHECK-NEXT: vldrw.u32 q1, [r2] 2014; CHECK-NEXT: vpt.i32 ne, q1, zr 2015; CHECK-NEXT: vstrbt.32 q0, [r0, #127] 2016; CHECK-NEXT: bx lr 2017entry: 2018 %z = getelementptr inbounds i8, i8* %y, i32 127 2019 %0 = bitcast i8* %x to <4 x i8>* 2020 %mask = load <4 x i32>, <4 x i32>* %m, align 4 2021 %c = icmp ne <4 x i32> %mask, 
zeroinitializer 2022 %1 = load <4 x i8>, <4 x i8>* %0, align 1 2023 %2 = bitcast i8* %z to <4 x i8>* 2024 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) 2025 ret i8* %y 2026} 2027 2028define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) { 2029; CHECK-LABEL: strb32_128: 2030; CHECK: @ %bb.0: @ %entry 2031; CHECK-NEXT: vldrb.u32 q0, [r1] 2032; CHECK-NEXT: add.w r1, r0, #128 2033; CHECK-NEXT: vldrw.u32 q1, [r2] 2034; CHECK-NEXT: vpt.i32 ne, q1, zr 2035; CHECK-NEXT: vstrbt.32 q0, [r1] 2036; CHECK-NEXT: bx lr 2037entry: 2038 %z = getelementptr inbounds i8, i8* %y, i32 128 2039 %0 = bitcast i8* %x to <4 x i8>* 2040 %mask = load <4 x i32>, <4 x i32>* %m, align 4 2041 %c = icmp ne <4 x i32> %mask, zeroinitializer 2042 %1 = load <4 x i8>, <4 x i8>* %0, align 1 2043 %2 = bitcast i8* %z to <4 x i8>* 2044 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) 2045 ret i8* %y 2046} 2047 2048define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) { 2049; CHECK-LABEL: strb32_m127: 2050; CHECK: @ %bb.0: @ %entry 2051; CHECK-NEXT: vldrb.u32 q0, [r1] 2052; CHECK-NEXT: vldrw.u32 q1, [r2] 2053; CHECK-NEXT: vpt.i32 ne, q1, zr 2054; CHECK-NEXT: vstrbt.32 q0, [r0, #-127] 2055; CHECK-NEXT: bx lr 2056entry: 2057 %z = getelementptr inbounds i8, i8* %y, i32 -127 2058 %0 = bitcast i8* %x to <4 x i8>* 2059 %mask = load <4 x i32>, <4 x i32>* %m, align 4 2060 %c = icmp ne <4 x i32> %mask, zeroinitializer 2061 %1 = load <4 x i8>, <4 x i8>* %0, align 1 2062 %2 = bitcast i8* %z to <4 x i8>* 2063 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) 2064 ret i8* %y 2065} 2066 2067define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) { 2068; CHECK-LABEL: strb32_m128: 2069; CHECK: @ %bb.0: @ %entry 2070; CHECK-NEXT: vldrb.u32 q0, [r1] 2071; CHECK-NEXT: sub.w r1, r0, #128 2072; CHECK-NEXT: vldrw.u32 q1, [r2] 2073; CHECK-NEXT: vpt.i32 ne, q1, zr 2074; CHECK-NEXT: vstrbt.32 q0, [r1] 2075; 
CHECK-NEXT: bx lr 2076entry: 2077 %z = getelementptr inbounds i8, i8* %y, i32 -128 2078 %0 = bitcast i8* %x to <4 x i8>* 2079 %mask = load <4 x i32>, <4 x i32>* %m, align 4 2080 %c = icmp ne <4 x i32> %mask, zeroinitializer 2081 %1 = load <4 x i8>, <4 x i8>* %0, align 1 2082 %2 = bitcast i8* %z to <4 x i8>* 2083 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) 2084 ret i8* %y 2085} 2086 2087define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) { 2088; CHECK-LABEL: strb16_4: 2089; CHECK: @ %bb.0: @ %entry 2090; CHECK-NEXT: vldrb.u16 q0, [r1] 2091; CHECK-NEXT: vldrh.u16 q1, [r2] 2092; CHECK-NEXT: vpt.i16 ne, q1, zr 2093; CHECK-NEXT: vstrbt.16 q0, [r0, #4] 2094; CHECK-NEXT: bx lr 2095entry: 2096 %z = getelementptr inbounds i8, i8* %y, i32 4 2097 %0 = bitcast i8* %x to <8 x i8>* 2098 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2099 %c = icmp ne <8 x i16> %mask, zeroinitializer 2100 %1 = load <8 x i8>, <8 x i8>* %0, align 1 2101 %2 = bitcast i8* %z to <8 x i8>* 2102 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) 2103 ret i8* %y 2104} 2105 2106define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) { 2107; CHECK-LABEL: strb16_3: 2108; CHECK: @ %bb.0: @ %entry 2109; CHECK-NEXT: vldrb.u16 q0, [r1] 2110; CHECK-NEXT: vldrh.u16 q1, [r2] 2111; CHECK-NEXT: vpt.i16 ne, q1, zr 2112; CHECK-NEXT: vstrbt.16 q0, [r0, #3] 2113; CHECK-NEXT: bx lr 2114entry: 2115 %z = getelementptr inbounds i8, i8* %y, i32 3 2116 %0 = bitcast i8* %x to <8 x i8>* 2117 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2118 %c = icmp ne <8 x i16> %mask, zeroinitializer 2119 %1 = load <8 x i8>, <8 x i8>* %0, align 1 2120 %2 = bitcast i8* %z to <8 x i8>* 2121 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) 2122 ret i8* %y 2123} 2124 2125define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) { 2126; CHECK-LABEL: strb16_2: 2127; CHECK: @ %bb.0: @ %entry 2128; CHECK-NEXT: vldrb.u16 q0, [r1] 2129; 
CHECK-NEXT: vldrh.u16 q1, [r2] 2130; CHECK-NEXT: vpt.i16 ne, q1, zr 2131; CHECK-NEXT: vstrbt.16 q0, [r0, #2] 2132; CHECK-NEXT: bx lr 2133entry: 2134 %z = getelementptr inbounds i8, i8* %y, i32 2 2135 %0 = bitcast i8* %x to <8 x i8>* 2136 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2137 %c = icmp ne <8 x i16> %mask, zeroinitializer 2138 %1 = load <8 x i8>, <8 x i8>* %0, align 1 2139 %2 = bitcast i8* %z to <8 x i8>* 2140 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) 2141 ret i8* %y 2142} 2143 2144define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) { 2145; CHECK-LABEL: strb16_127: 2146; CHECK: @ %bb.0: @ %entry 2147; CHECK-NEXT: vldrb.u16 q0, [r1] 2148; CHECK-NEXT: vldrh.u16 q1, [r2] 2149; CHECK-NEXT: vpt.i16 ne, q1, zr 2150; CHECK-NEXT: vstrbt.16 q0, [r0, #127] 2151; CHECK-NEXT: bx lr 2152entry: 2153 %z = getelementptr inbounds i8, i8* %y, i32 127 2154 %0 = bitcast i8* %x to <8 x i8>* 2155 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2156 %c = icmp ne <8 x i16> %mask, zeroinitializer 2157 %1 = load <8 x i8>, <8 x i8>* %0, align 1 2158 %2 = bitcast i8* %z to <8 x i8>* 2159 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) 2160 ret i8* %y 2161} 2162 2163define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) { 2164; CHECK-LABEL: strb16_128: 2165; CHECK: @ %bb.0: @ %entry 2166; CHECK-NEXT: vldrb.u16 q0, [r1] 2167; CHECK-NEXT: add.w r1, r0, #128 2168; CHECK-NEXT: vldrh.u16 q1, [r2] 2169; CHECK-NEXT: vpt.i16 ne, q1, zr 2170; CHECK-NEXT: vstrbt.16 q0, [r1] 2171; CHECK-NEXT: bx lr 2172entry: 2173 %z = getelementptr inbounds i8, i8* %y, i32 128 2174 %0 = bitcast i8* %x to <8 x i8>* 2175 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2176 %c = icmp ne <8 x i16> %mask, zeroinitializer 2177 %1 = load <8 x i8>, <8 x i8>* %0, align 1 2178 %2 = bitcast i8* %z to <8 x i8>* 2179 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) 2180 ret i8* %y 2181} 2182 
2183define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) { 2184; CHECK-LABEL: strb16_m127: 2185; CHECK: @ %bb.0: @ %entry 2186; CHECK-NEXT: vldrb.u16 q0, [r1] 2187; CHECK-NEXT: vldrh.u16 q1, [r2] 2188; CHECK-NEXT: vpt.i16 ne, q1, zr 2189; CHECK-NEXT: vstrbt.16 q0, [r0, #-127] 2190; CHECK-NEXT: bx lr 2191entry: 2192 %z = getelementptr inbounds i8, i8* %y, i32 -127 2193 %0 = bitcast i8* %x to <8 x i8>* 2194 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2195 %c = icmp ne <8 x i16> %mask, zeroinitializer 2196 %1 = load <8 x i8>, <8 x i8>* %0, align 1 2197 %2 = bitcast i8* %z to <8 x i8>* 2198 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) 2199 ret i8* %y 2200} 2201 2202define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) { 2203; CHECK-LABEL: strb16_m128: 2204; CHECK: @ %bb.0: @ %entry 2205; CHECK-NEXT: vldrb.u16 q0, [r1] 2206; CHECK-NEXT: sub.w r1, r0, #128 2207; CHECK-NEXT: vldrh.u16 q1, [r2] 2208; CHECK-NEXT: vpt.i16 ne, q1, zr 2209; CHECK-NEXT: vstrbt.16 q0, [r1] 2210; CHECK-NEXT: bx lr 2211entry: 2212 %z = getelementptr inbounds i8, i8* %y, i32 -128 2213 %0 = bitcast i8* %x to <8 x i8>* 2214 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2215 %c = icmp ne <8 x i16> %mask, zeroinitializer 2216 %1 = load <8 x i8>, <8 x i8>* %0, align 1 2217 %2 = bitcast i8* %z to <8 x i8>* 2218 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) 2219 ret i8* %y 2220} 2221 2222define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) { 2223; CHECK-LABEL: strb8_4: 2224; CHECK: @ %bb.0: @ %entry 2225; CHECK-NEXT: vldrb.u8 q0, [r1] 2226; CHECK-NEXT: vldrb.u8 q1, [r2] 2227; CHECK-NEXT: vpt.i8 ne, q1, zr 2228; CHECK-NEXT: vstrbt.8 q0, [r0, #4] 2229; CHECK-NEXT: bx lr 2230entry: 2231 %z = getelementptr inbounds i8, i8* %y, i32 4 2232 %0 = bitcast i8* %x to <16 x i8>* 2233 %mask = load <16 x i8>, <16 x i8>* %m, align 1 2234 %c = icmp ne <16 x i8> %mask, zeroinitializer 2235 %1 = load <16 x i8>, <16 x i8>* %0, align 1 
2236 %2 = bitcast i8* %z to <16 x i8>* 2237 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) 2238 ret i8* %y 2239} 2240 2241define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) { 2242; CHECK-LABEL: strb8_3: 2243; CHECK: @ %bb.0: @ %entry 2244; CHECK-NEXT: vldrb.u8 q0, [r1] 2245; CHECK-NEXT: vldrb.u8 q1, [r2] 2246; CHECK-NEXT: vpt.i8 ne, q1, zr 2247; CHECK-NEXT: vstrbt.8 q0, [r0, #3] 2248; CHECK-NEXT: bx lr 2249entry: 2250 %z = getelementptr inbounds i8, i8* %y, i32 3 2251 %0 = bitcast i8* %x to <16 x i8>* 2252 %mask = load <16 x i8>, <16 x i8>* %m, align 1 2253 %c = icmp ne <16 x i8> %mask, zeroinitializer 2254 %1 = load <16 x i8>, <16 x i8>* %0, align 1 2255 %2 = bitcast i8* %z to <16 x i8>* 2256 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) 2257 ret i8* %y 2258} 2259 2260define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) { 2261; CHECK-LABEL: strb8_2: 2262; CHECK: @ %bb.0: @ %entry 2263; CHECK-NEXT: vldrb.u8 q0, [r1] 2264; CHECK-NEXT: vldrb.u8 q1, [r2] 2265; CHECK-NEXT: vpt.i8 ne, q1, zr 2266; CHECK-NEXT: vstrbt.8 q0, [r0, #2] 2267; CHECK-NEXT: bx lr 2268entry: 2269 %z = getelementptr inbounds i8, i8* %y, i32 2 2270 %0 = bitcast i8* %x to <16 x i8>* 2271 %mask = load <16 x i8>, <16 x i8>* %m, align 1 2272 %c = icmp ne <16 x i8> %mask, zeroinitializer 2273 %1 = load <16 x i8>, <16 x i8>* %0, align 1 2274 %2 = bitcast i8* %z to <16 x i8>* 2275 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) 2276 ret i8* %y 2277} 2278 2279define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) { 2280; CHECK-LABEL: strb8_127: 2281; CHECK: @ %bb.0: @ %entry 2282; CHECK-NEXT: vldrb.u8 q0, [r1] 2283; CHECK-NEXT: vldrb.u8 q1, [r2] 2284; CHECK-NEXT: vpt.i8 ne, q1, zr 2285; CHECK-NEXT: vstrbt.8 q0, [r0, #127] 2286; CHECK-NEXT: bx lr 2287entry: 2288 %z = getelementptr inbounds i8, i8* %y, i32 127 2289 %0 = bitcast i8* %x to <16 x i8>* 2290 %mask = load <16 x i8>, 
<16 x i8>* %m, align 1 2291 %c = icmp ne <16 x i8> %mask, zeroinitializer 2292 %1 = load <16 x i8>, <16 x i8>* %0, align 1 2293 %2 = bitcast i8* %z to <16 x i8>* 2294 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) 2295 ret i8* %y 2296} 2297 2298define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) { 2299; CHECK-LABEL: strb8_128: 2300; CHECK: @ %bb.0: @ %entry 2301; CHECK-NEXT: vldrb.u8 q0, [r1] 2302; CHECK-NEXT: add.w r1, r0, #128 2303; CHECK-NEXT: vldrb.u8 q1, [r2] 2304; CHECK-NEXT: vpt.i8 ne, q1, zr 2305; CHECK-NEXT: vstrbt.8 q0, [r1] 2306; CHECK-NEXT: bx lr 2307entry: 2308 %z = getelementptr inbounds i8, i8* %y, i32 128 2309 %0 = bitcast i8* %x to <16 x i8>* 2310 %mask = load <16 x i8>, <16 x i8>* %m, align 1 2311 %c = icmp ne <16 x i8> %mask, zeroinitializer 2312 %1 = load <16 x i8>, <16 x i8>* %0, align 1 2313 %2 = bitcast i8* %z to <16 x i8>* 2314 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) 2315 ret i8* %y 2316} 2317 2318define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) { 2319; CHECK-LABEL: strb8_m127: 2320; CHECK: @ %bb.0: @ %entry 2321; CHECK-NEXT: vldrb.u8 q0, [r1] 2322; CHECK-NEXT: vldrb.u8 q1, [r2] 2323; CHECK-NEXT: vpt.i8 ne, q1, zr 2324; CHECK-NEXT: vstrbt.8 q0, [r0, #-127] 2325; CHECK-NEXT: bx lr 2326entry: 2327 %z = getelementptr inbounds i8, i8* %y, i32 -127 2328 %0 = bitcast i8* %x to <16 x i8>* 2329 %mask = load <16 x i8>, <16 x i8>* %m, align 1 2330 %c = icmp ne <16 x i8> %mask, zeroinitializer 2331 %1 = load <16 x i8>, <16 x i8>* %0, align 1 2332 %2 = bitcast i8* %z to <16 x i8>* 2333 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) 2334 ret i8* %y 2335} 2336 2337define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) { 2338; CHECK-LABEL: strb8_m128: 2339; CHECK: @ %bb.0: @ %entry 2340; CHECK-NEXT: vldrb.u8 q0, [r1] 2341; CHECK-NEXT: sub.w r1, r0, #128 2342; CHECK-NEXT: vldrb.u8 q1, [r2] 2343; CHECK-NEXT: 
vpt.i8 ne, q1, zr 2344; CHECK-NEXT: vstrbt.8 q0, [r1] 2345; CHECK-NEXT: bx lr 2346entry: 2347 %z = getelementptr inbounds i8, i8* %y, i32 -128 2348 %0 = bitcast i8* %x to <16 x i8>* 2349 %mask = load <16 x i8>, <16 x i8>* %m, align 1 2350 %c = icmp ne <16 x i8> %mask, zeroinitializer 2351 %1 = load <16 x i8>, <16 x i8>* %0, align 1 2352 %2 = bitcast i8* %z to <16 x i8>* 2353 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) 2354 ret i8* %y 2355} 2356 2357define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) { 2358; CHECK-LABEL: strwf32_4: 2359; CHECK: @ %bb.0: @ %entry 2360; CHECK-NEXT: vldrw.u32 q0, [r1] 2361; CHECK-NEXT: vldrw.u32 q1, [r2] 2362; CHECK-NEXT: vpt.i32 ne, q1, zr 2363; CHECK-NEXT: vstrwt.32 q0, [r0, #4] 2364; CHECK-NEXT: bx lr 2365entry: 2366 %z = getelementptr inbounds i8, i8* %y, i32 4 2367 %0 = bitcast i8* %x to <4 x float>* 2368 %mask = load <4 x i32>, <4 x i32>* %m, align 4 2369 %c = icmp ne <4 x i32> %mask, zeroinitializer 2370 %1 = load <4 x float>, <4 x float>* %0, align 4 2371 %2 = bitcast i8* %z to <4 x float>* 2372 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) 2373 ret i8* %y 2374} 2375 2376define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) { 2377; CHECK-LABEL: strwf32_3: 2378; CHECK: @ %bb.0: @ %entry 2379; CHECK-NEXT: vldrw.u32 q0, [r1] 2380; CHECK-NEXT: adds r1, r0, #3 2381; CHECK-NEXT: vldrw.u32 q1, [r2] 2382; CHECK-NEXT: vpt.i32 ne, q1, zr 2383; CHECK-NEXT: vstrwt.32 q0, [r1] 2384; CHECK-NEXT: bx lr 2385entry: 2386 %z = getelementptr inbounds i8, i8* %y, i32 3 2387 %0 = bitcast i8* %x to <4 x float>* 2388 %mask = load <4 x i32>, <4 x i32>* %m, align 4 2389 %c = icmp ne <4 x i32> %mask, zeroinitializer 2390 %1 = load <4 x float>, <4 x float>* %0, align 4 2391 %2 = bitcast i8* %z to <4 x float>* 2392 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) 2393 ret i8* %y 2394} 2395 2396define i8* 
@strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) { 2397; CHECK-LABEL: strwf32_2: 2398; CHECK: @ %bb.0: @ %entry 2399; CHECK-NEXT: vldrw.u32 q0, [r1] 2400; CHECK-NEXT: adds r1, r0, #2 2401; CHECK-NEXT: vldrw.u32 q1, [r2] 2402; CHECK-NEXT: vpt.i32 ne, q1, zr 2403; CHECK-NEXT: vstrwt.32 q0, [r1] 2404; CHECK-NEXT: bx lr 2405entry: 2406 %z = getelementptr inbounds i8, i8* %y, i32 2 2407 %0 = bitcast i8* %x to <4 x float>* 2408 %mask = load <4 x i32>, <4 x i32>* %m, align 4 2409 %c = icmp ne <4 x i32> %mask, zeroinitializer 2410 %1 = load <4 x float>, <4 x float>* %0, align 4 2411 %2 = bitcast i8* %z to <4 x float>* 2412 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) 2413 ret i8* %y 2414} 2415 2416define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) { 2417; CHECK-LABEL: strwf32_508: 2418; CHECK: @ %bb.0: @ %entry 2419; CHECK-NEXT: vldrw.u32 q0, [r1] 2420; CHECK-NEXT: vldrw.u32 q1, [r2] 2421; CHECK-NEXT: vpt.i32 ne, q1, zr 2422; CHECK-NEXT: vstrwt.32 q0, [r0, #508] 2423; CHECK-NEXT: bx lr 2424entry: 2425 %z = getelementptr inbounds i8, i8* %y, i32 508 2426 %0 = bitcast i8* %x to <4 x float>* 2427 %mask = load <4 x i32>, <4 x i32>* %m, align 4 2428 %c = icmp ne <4 x i32> %mask, zeroinitializer 2429 %1 = load <4 x float>, <4 x float>* %0, align 4 2430 %2 = bitcast i8* %z to <4 x float>* 2431 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) 2432 ret i8* %y 2433} 2434 2435define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) { 2436; CHECK-LABEL: strwf32_512: 2437; CHECK: @ %bb.0: @ %entry 2438; CHECK-NEXT: vldrw.u32 q0, [r1] 2439; CHECK-NEXT: add.w r1, r0, #512 2440; CHECK-NEXT: vldrw.u32 q1, [r2] 2441; CHECK-NEXT: vpt.i32 ne, q1, zr 2442; CHECK-NEXT: vstrwt.32 q0, [r1] 2443; CHECK-NEXT: bx lr 2444entry: 2445 %z = getelementptr inbounds i8, i8* %y, i32 512 2446 %0 = bitcast i8* %x to <4 x float>* 2447 %mask = load <4 x i32>, <4 x i32>* %m, align 4 2448 %c = icmp ne <4 x i32> %mask, 
zeroinitializer 2449 %1 = load <4 x float>, <4 x float>* %0, align 4 2450 %2 = bitcast i8* %z to <4 x float>* 2451 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) 2452 ret i8* %y 2453} 2454 2455define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) { 2456; CHECK-LABEL: strwf32_m508: 2457; CHECK: @ %bb.0: @ %entry 2458; CHECK-NEXT: vldrw.u32 q0, [r1] 2459; CHECK-NEXT: vldrw.u32 q1, [r2] 2460; CHECK-NEXT: vpt.i32 ne, q1, zr 2461; CHECK-NEXT: vstrwt.32 q0, [r0, #-508] 2462; CHECK-NEXT: bx lr 2463entry: 2464 %z = getelementptr inbounds i8, i8* %y, i32 -508 2465 %0 = bitcast i8* %x to <4 x float>* 2466 %mask = load <4 x i32>, <4 x i32>* %m, align 4 2467 %c = icmp ne <4 x i32> %mask, zeroinitializer 2468 %1 = load <4 x float>, <4 x float>* %0, align 4 2469 %2 = bitcast i8* %z to <4 x float>* 2470 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) 2471 ret i8* %y 2472} 2473 2474define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) { 2475; CHECK-LABEL: strwf32_m512: 2476; CHECK: @ %bb.0: @ %entry 2477; CHECK-NEXT: vldrw.u32 q0, [r1] 2478; CHECK-NEXT: sub.w r1, r0, #512 2479; CHECK-NEXT: vldrw.u32 q1, [r2] 2480; CHECK-NEXT: vpt.i32 ne, q1, zr 2481; CHECK-NEXT: vstrwt.32 q0, [r1] 2482; CHECK-NEXT: bx lr 2483entry: 2484 %z = getelementptr inbounds i8, i8* %y, i32 -512 2485 %0 = bitcast i8* %x to <4 x float>* 2486 %mask = load <4 x i32>, <4 x i32>* %m, align 4 2487 %c = icmp ne <4 x i32> %mask, zeroinitializer 2488 %1 = load <4 x float>, <4 x float>* %0, align 4 2489 %2 = bitcast i8* %z to <4 x float>* 2490 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) 2491 ret i8* %y 2492} 2493 2494define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) { 2495; CHECK-LABEL: strhf16_4: 2496; CHECK: @ %bb.0: @ %entry 2497; CHECK-NEXT: vldrh.u16 q0, [r1] 2498; CHECK-NEXT: vldrh.u16 q1, [r2] 2499; CHECK-NEXT: vpt.i16 ne, q1, zr 2500; CHECK-NEXT: vstrht.16 
q0, [r0, #4] 2501; CHECK-NEXT: bx lr 2502entry: 2503 %z = getelementptr inbounds i8, i8* %y, i32 4 2504 %0 = bitcast i8* %x to <8 x half>* 2505 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2506 %c = icmp ne <8 x i16> %mask, zeroinitializer 2507 %1 = load <8 x half>, <8 x half>* %0, align 2 2508 %2 = bitcast i8* %z to <8 x half>* 2509 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) 2510 ret i8* %y 2511} 2512 2513define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) { 2514; CHECK-LABEL: strhf16_3: 2515; CHECK: @ %bb.0: @ %entry 2516; CHECK-NEXT: vldrh.u16 q0, [r1] 2517; CHECK-NEXT: adds r1, r0, #3 2518; CHECK-NEXT: vldrh.u16 q1, [r2] 2519; CHECK-NEXT: vpt.i16 ne, q1, zr 2520; CHECK-NEXT: vstrht.16 q0, [r1] 2521; CHECK-NEXT: bx lr 2522entry: 2523 %z = getelementptr inbounds i8, i8* %y, i32 3 2524 %0 = bitcast i8* %x to <8 x half>* 2525 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2526 %c = icmp ne <8 x i16> %mask, zeroinitializer 2527 %1 = load <8 x half>, <8 x half>* %0, align 2 2528 %2 = bitcast i8* %z to <8 x half>* 2529 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) 2530 ret i8* %y 2531} 2532 2533define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) { 2534; CHECK-LABEL: strhf16_2: 2535; CHECK: @ %bb.0: @ %entry 2536; CHECK-NEXT: vldrh.u16 q0, [r1] 2537; CHECK-NEXT: vldrh.u16 q1, [r2] 2538; CHECK-NEXT: vpt.i16 ne, q1, zr 2539; CHECK-NEXT: vstrht.16 q0, [r0, #2] 2540; CHECK-NEXT: bx lr 2541entry: 2542 %z = getelementptr inbounds i8, i8* %y, i32 2 2543 %0 = bitcast i8* %x to <8 x half>* 2544 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2545 %c = icmp ne <8 x i16> %mask, zeroinitializer 2546 %1 = load <8 x half>, <8 x half>* %0, align 2 2547 %2 = bitcast i8* %z to <8 x half>* 2548 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) 2549 ret i8* %y 2550} 2551 2552define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) { 2553; 
CHECK-LABEL: strhf16_254: 2554; CHECK: @ %bb.0: @ %entry 2555; CHECK-NEXT: vldrh.u16 q0, [r1] 2556; CHECK-NEXT: vldrh.u16 q1, [r2] 2557; CHECK-NEXT: vpt.i16 ne, q1, zr 2558; CHECK-NEXT: vstrht.16 q0, [r0, #254] 2559; CHECK-NEXT: bx lr 2560entry: 2561 %z = getelementptr inbounds i8, i8* %y, i32 254 2562 %0 = bitcast i8* %x to <8 x half>* 2563 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2564 %c = icmp ne <8 x i16> %mask, zeroinitializer 2565 %1 = load <8 x half>, <8 x half>* %0, align 2 2566 %2 = bitcast i8* %z to <8 x half>* 2567 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) 2568 ret i8* %y 2569} 2570 2571define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) { 2572; CHECK-LABEL: strhf16_256: 2573; CHECK: @ %bb.0: @ %entry 2574; CHECK-NEXT: vldrh.u16 q0, [r1] 2575; CHECK-NEXT: add.w r1, r0, #256 2576; CHECK-NEXT: vldrh.u16 q1, [r2] 2577; CHECK-NEXT: vpt.i16 ne, q1, zr 2578; CHECK-NEXT: vstrht.16 q0, [r1] 2579; CHECK-NEXT: bx lr 2580entry: 2581 %z = getelementptr inbounds i8, i8* %y, i32 256 2582 %0 = bitcast i8* %x to <8 x half>* 2583 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2584 %c = icmp ne <8 x i16> %mask, zeroinitializer 2585 %1 = load <8 x half>, <8 x half>* %0, align 2 2586 %2 = bitcast i8* %z to <8 x half>* 2587 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) 2588 ret i8* %y 2589} 2590 2591define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) { 2592; CHECK-LABEL: strhf16_m254: 2593; CHECK: @ %bb.0: @ %entry 2594; CHECK-NEXT: vldrh.u16 q0, [r1] 2595; CHECK-NEXT: vldrh.u16 q1, [r2] 2596; CHECK-NEXT: vpt.i16 ne, q1, zr 2597; CHECK-NEXT: vstrht.16 q0, [r0, #-254] 2598; CHECK-NEXT: bx lr 2599entry: 2600 %z = getelementptr inbounds i8, i8* %y, i32 -254 2601 %0 = bitcast i8* %x to <8 x half>* 2602 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2603 %c = icmp ne <8 x i16> %mask, zeroinitializer 2604 %1 = load <8 x half>, <8 x half>* %0, align 2 2605 %2 = bitcast 
i8* %z to <8 x half>* 2606 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) 2607 ret i8* %y 2608} 2609 2610define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) { 2611; CHECK-LABEL: strhf16_m256: 2612; CHECK: @ %bb.0: @ %entry 2613; CHECK-NEXT: vldrh.u16 q0, [r1] 2614; CHECK-NEXT: sub.w r1, r0, #256 2615; CHECK-NEXT: vldrh.u16 q1, [r2] 2616; CHECK-NEXT: vpt.i16 ne, q1, zr 2617; CHECK-NEXT: vstrht.16 q0, [r1] 2618; CHECK-NEXT: bx lr 2619entry: 2620 %z = getelementptr inbounds i8, i8* %y, i32 -256 2621 %0 = bitcast i8* %x to <8 x half>* 2622 %mask = load <8 x i16>, <8 x i16>* %m, align 2 2623 %c = icmp ne <8 x i16> %mask, zeroinitializer 2624 %1 = load <8 x half>, <8 x half>* %0, align 2 2625 %2 = bitcast i8* %z to <8 x half>* 2626 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) 2627 ret i8* %y 2628} 2629 2630declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) 2631declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>) 2632declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) 2633declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>) 2634declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) 2635declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) 2636declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) 2637declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) 2638 2639declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) 2640declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) 2641declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>) 2642declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x
i1>) 2643declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>) 2644declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>) 2645declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) 2646declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>) 2647