; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s

; Code-generation tests for AArch64 NEON subtract variants. The llc invocation
; above selects the Apple NEON syntax, so the expected mnemonics carry the
; vector arrangement as a suffix (for example `subhn.8b`).
;
; Layout of this file:
;   * subhn / rsubhn : calls to the llvm.aarch64.neon.{subhn,rsubhn} intrinsics
;   * ssubl / usubl  : both operands sign-/zero-extended, then subtracted
;   * ssubw / usubw  : only the second operand sign-/zero-extended
; Functions with a `2` in their name either concatenate two narrow results
; (subhn2 / rsubhn2) or take the upper half of a 128-bit input vector first.

;------------------------------------------------------------------------------
; subhn: intrinsic call on two loaded vectors, one expected instruction each.
;------------------------------------------------------------------------------

define <8 x i8> @subhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: subhn8b:
;CHECK: subhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i8> %tmp3
}

define <4 x i16> @subhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: subhn4h:
;CHECK: subhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @subhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: subhn2s:
;CHECK: subhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i32> %tmp3
}

; subhn2: two subhn results are concatenated (low half, then high half) with a
; shufflevector. The expected output is a plain subhn immediately followed by
; the subhn2 form that writes the upper half of the destination.

define <16 x i8> @subhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: subhn2_16b:
;CHECK: subhn.8b
;CHECK-NEXT: subhn2.16b
  %vsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %vsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %res = shufflevector <8 x i8> %vsubhn2.i, <8 x i8> %vsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @subhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: subhn2_8h:
;CHECK: subhn.4h
;CHECK-NEXT: subhn2.8h
  %vsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %vsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %res = shufflevector <4 x i16> %vsubhn2.i, <4 x i16> %vsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @subhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: subhn2_4s:
;CHECK: subhn.2s
;CHECK-NEXT: subhn2.4s
  %vsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %vsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %res = shufflevector <2 x i32> %vsubhn2.i, <2 x i32> %vsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

declare <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone

;------------------------------------------------------------------------------
; rsubhn: same test shapes as the subhn group, using the rsubhn intrinsics.
;------------------------------------------------------------------------------

define <8 x i8> @rsubhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: rsubhn8b:
;CHECK: rsubhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i8> %tmp3
}

define <4 x i16> @rsubhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: rsubhn4h:
;CHECK: rsubhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @rsubhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: rsubhn2s:
;CHECK: rsubhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i32> %tmp3
}

; rsubhn2: concatenation of two rsubhn results; expects rsubhn immediately
; followed by rsubhn2.

define <16 x i8> @rsubhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: rsubhn2_16b:
;CHECK: rsubhn.8b
;CHECK-NEXT: rsubhn2.16b
  %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %vrsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %res = shufflevector <8 x i8> %vrsubhn2.i, <8 x i8> %vrsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @rsubhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: rsubhn2_8h:
;CHECK: rsubhn.4h
;CHECK-NEXT: rsubhn2.8h
  %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %vrsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %res = shufflevector <4 x i16> %vrsubhn2.i, <4 x i16> %vrsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @rsubhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: rsubhn2_4s:
;CHECK: rsubhn.2s
;CHECK-NEXT: rsubhn2.4s
  %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %vrsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %res = shufflevector <2 x i32> %vrsubhn2.i, <2 x i32> %vrsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone

;------------------------------------------------------------------------------
; ssubl: both loaded operands are sign-extended to twice the element width and
; then subtracted; a single ssubl is expected.
;------------------------------------------------------------------------------

define <8 x i16> @ssubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: ssubl8h:
;CHECK: ssubl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sub <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @ssubl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: ssubl4s:
;CHECK: ssubl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sub <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @ssubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: ssubl2d:
;CHECK: ssubl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sub <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; ssubl2_*: each operand is a 128-bit load whose upper half is extracted with
; a shufflevector before the sign-extension and subtract.
; NOTE(review): the expected mnemonic is the non-"2" form (ssubl, not ssubl2);
; presumably the high-half extract is folded into a narrower load - confirm
; against llc output before tightening these expectations.

define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: ssubl2_8h:
;CHECK: ssubl.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext1 = sext <8 x i8> %high1 to <8 x i16>

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = sext <8 x i8> %high2 to <8 x i16>

  %res = sub <8 x i16> %ext1, %ext2
  ret <8 x i16> %res
}

define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: ssubl2_4s:
;CHECK: ssubl.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext1 = sext <4 x i16> %high1 to <4 x i32>

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = sext <4 x i16> %high2 to <4 x i32>

  %res = sub <4 x i32> %ext1, %ext2
  ret <4 x i32> %res
}

define <2 x i64> @ssubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: ssubl2_2d:
;CHECK: ssubl.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext1 = sext <2 x i32> %high1 to <2 x i64>

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = sext <2 x i32> %high2 to <2 x i64>

  %res = sub <2 x i64> %ext1, %ext2
  ret <2 x i64> %res
}

;------------------------------------------------------------------------------
; usubl: as the ssubl group, with zero-extension instead of sign-extension.
;------------------------------------------------------------------------------

define <8 x i16> @usubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: usubl8h:
;CHECK: usubl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sub <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @usubl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: usubl4s:
;CHECK: usubl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sub <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @usubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: usubl2d:
;CHECK: usubl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sub <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; usubl2_*: upper half of each 128-bit load is extracted, zero-extended and
; subtracted. As with the signed variants, the non-"2" mnemonic is expected.

define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: usubl2_8h:
;CHECK: usubl.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext1 = zext <8 x i8> %high1 to <8 x i16>

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = zext <8 x i8> %high2 to <8 x i16>

  %res = sub <8 x i16> %ext1, %ext2
  ret <8 x i16> %res
}

define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: usubl2_4s:
;CHECK: usubl.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext1 = zext <4 x i16> %high1 to <4 x i32>

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = zext <4 x i16> %high2 to <4 x i32>

  %res = sub <4 x i32> %ext1, %ext2
  ret <4 x i32> %res
}

define <2 x i64> @usubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: usubl2_2d:
;CHECK: usubl.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext1 = zext <2 x i32> %high1 to <2 x i64>

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = zext <2 x i32> %high2 to <2 x i64>

  %res = sub <2 x i64> %ext1, %ext2
  ret <2 x i64> %res
}

;------------------------------------------------------------------------------
; ssubw: the first operand is already wide; only the second operand is
; sign-extended before the subtract.
;------------------------------------------------------------------------------

define <8 x i16> @ssubw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: ssubw8h:
;CHECK: ssubw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp4 = sub <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <4 x i32> @ssubw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: ssubw4s:
;CHECK: ssubw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp4 = sub <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

define <2 x i64> @ssubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: ssubw2d:
;CHECK: ssubw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp4 = sub <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

; ssubw2_*: the narrow operand is the upper half of a 128-bit load, extracted
; with a shufflevector and sign-extended. The non-"2" mnemonic is expected.

define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: ssubw2_8h:
;CHECK: ssubw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = sext <8 x i8> %high2 to <8 x i16>

  %res = sub <8 x i16> %tmp1, %ext2
  ret <8 x i16> %res
}

define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: ssubw2_4s:
;CHECK: ssubw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = sext <4 x i16> %high2 to <4 x i32>

  %res = sub <4 x i32> %tmp1, %ext2
  ret <4 x i32> %res
}

define <2 x i64> @ssubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: ssubw2_2d:
;CHECK: ssubw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = sext <2 x i32> %high2 to <2 x i64>

  %res = sub <2 x i64> %tmp1, %ext2
  ret <2 x i64> %res
}

;------------------------------------------------------------------------------
; usubw: as the ssubw group, with zero-extension of the second operand.
;------------------------------------------------------------------------------

define <8 x i16> @usubw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: usubw8h:
;CHECK: usubw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp4 = sub <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <4 x i32> @usubw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: usubw4s:
;CHECK: usubw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp4 = sub <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

define <2 x i64> @usubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: usubw2d:
;CHECK: usubw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp4 = sub <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

; usubw2_*: the narrow operand is the upper half of a 128-bit load, extracted
; with a shufflevector and zero-extended. The non-"2" mnemonic is expected.

define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: usubw2_8h:
;CHECK: usubw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = zext <8 x i8> %high2 to <8 x i16>

  %res = sub <8 x i16> %tmp1, %ext2
  ret <8 x i16> %res
}

define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: usubw2_4s:
;CHECK: usubw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = zext <4 x i16> %high2 to <4 x i32>

  %res = sub <4 x i32> %tmp1, %ext2
  ret <4 x i32> %res
}

define <2 x i64> @usubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: usubw2_2d:
;CHECK: usubw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = zext <2 x i32> %high2 to <2 x i64>

  %res = sub <2 x i64> %tmp1, %ext2
  ret <2 x i64> %res
}