; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s

; Calls the subhn intrinsic on two <8 x i16> vectors loaded from memory,
; producing <8 x i8>; the expected instruction is subhn.8b.
define <8 x i8> @subhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: subhn8b:
;CHECK: subhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i8> %tmp3
}

; Calls the subhn intrinsic on two <4 x i32> vectors loaded from memory,
; producing <4 x i16>; the expected instruction is subhn.4h.
define <4 x i16> @subhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: subhn4h:
;CHECK: subhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i16> %tmp3
}

; Calls the subhn intrinsic on two <2 x i64> vectors loaded from memory,
; producing <2 x i32>; the expected instruction is subhn.2s.
define <2 x i32> @subhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: subhn2s:
;CHECK: subhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i32> %tmp3
}

; Two subhn calls on the same <8 x i16> operands are concatenated by a
; shufflevector into one <16 x i8> result; the expected output is subhn.8b
; immediately followed by subhn2.16b.
define <16 x i8> @subhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: subhn2_16b:
;CHECK: subhn.8b
;CHECK-NEXT: subhn2.16b
  %vsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %vsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %res = shufflevector <8 x i8> %vsubhn2.i, <8 x i8> %vsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

; Two subhn calls on the same <4 x i32> operands are concatenated by a
; shufflevector into one <8 x i16> result; the expected output is subhn.4h
; immediately followed by subhn2.8h.
define <8 x i16> @subhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: subhn2_8h:
;CHECK: subhn.4h
;CHECK-NEXT: subhn2.8h
  %vsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %vsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %res = shufflevector <4 x i16> %vsubhn2.i, <4 x i16> %vsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

; Two subhn calls on the same <2 x i64> operands are concatenated by a
; shufflevector into one <4 x i32> result; the expected output is subhn.2s
; immediately followed by subhn2.4s.
define <4 x i32> @subhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: subhn2_4s:
;CHECK: subhn.2s
;CHECK-NEXT: subhn2.4s
  %vsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %vsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %res = shufflevector <2 x i32> %vsubhn2.i, <2 x i32> %vsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

; Declarations of the subhn intrinsic overloads called by the tests in this file.
declare <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone

; Calls the rsubhn intrinsic on two <8 x i16> vectors loaded from memory,
; producing <8 x i8>; the expected instruction is rsubhn.8b.
define <8 x i8> @rsubhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: rsubhn8b:
;CHECK: rsubhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i8> %tmp3
}

; Calls the rsubhn intrinsic on two <4 x i32> vectors loaded from memory,
; producing <4 x i16>; the expected instruction is rsubhn.4h.
define <4 x i16> @rsubhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: rsubhn4h:
;CHECK: rsubhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i16> %tmp3
}

; Calls the rsubhn intrinsic on two <2 x i64> vectors loaded from memory,
; producing <2 x i32>; the expected instruction is rsubhn.2s.
define <2 x i32> @rsubhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: rsubhn2s:
;CHECK: rsubhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i32> %tmp3
}

; Two rsubhn calls on the same <8 x i16> operands are concatenated by a
; shufflevector into one <16 x i8> result; the expected output is rsubhn.8b
; immediately followed by rsubhn2.16b.
define <16 x i8> @rsubhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: rsubhn2_16b:
;CHECK: rsubhn.8b
;CHECK-NEXT: rsubhn2.16b
  %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %vrsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %res = shufflevector <8 x i8> %vrsubhn2.i, <8 x i8> %vrsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

; Two rsubhn calls on the same <4 x i32> operands are concatenated by a
; shufflevector into one <8 x i16> result; the expected output is rsubhn.4h
; immediately followed by rsubhn2.8h.
define <8 x i16> @rsubhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: rsubhn2_8h:
;CHECK: rsubhn.4h
;CHECK-NEXT: rsubhn2.8h
  %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %vrsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %res = shufflevector <4 x i16> %vrsubhn2.i, <4 x i16> %vrsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

; Two rsubhn calls on the same <2 x i64> operands are concatenated by a
; shufflevector into one <4 x i32> result; the expected output is rsubhn.2s
; immediately followed by rsubhn2.4s.
define <4 x i32> @rsubhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: rsubhn2_4s:
;CHECK: rsubhn.2s
;CHECK-NEXT: rsubhn2.4s
  %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %vrsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %res = shufflevector <2 x i32> %vrsubhn2.i, <2 x i32> %vrsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

; Declarations of the rsubhn intrinsic overloads called by the tests in this file.
declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone

; Sign-extends two loaded <8 x i8> vectors to <8 x i16> and subtracts them;
; the expected instruction is ssubl.8h.
define <8 x i16> @ssubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: ssubl8h:
;CHECK: ssubl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sub <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

; Sign-extends two loaded <4 x i16> vectors to <4 x i32> and subtracts them;
; the expected instruction is ssubl.4s.
define <4 x i32> @ssubl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: ssubl4s:
;CHECK: ssubl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sub <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; Sign-extends two loaded <2 x i32> vectors to <2 x i64> and subtracts them;
; the expected instruction is ssubl.2d.
define <2 x i64> @ssubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: ssubl2d:
;CHECK: ssubl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sub <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; Extracts lanes 8-15 of two loaded <16 x i8> vectors, sign-extends each half
; to <8 x i16> and subtracts; the expected instruction is ssubl.8h.
define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: ssubl2_8h:
;CHECK: ssubl.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext1 = sext <8 x i8> %high1 to <8 x i16>

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = sext <8 x i8> %high2 to <8 x i16>

  %res = sub <8 x i16> %ext1, %ext2
  ret <8 x i16> %res
}

; Extracts lanes 4-7 of two loaded <8 x i16> vectors, sign-extends each half
; to <4 x i32> and subtracts; the expected instruction is ssubl.4s.
define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: ssubl2_4s:
;CHECK: ssubl.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext1 = sext <4 x i16> %high1 to <4 x i32>

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = sext <4 x i16> %high2 to <4 x i32>

  %res = sub <4 x i32> %ext1, %ext2
  ret <4 x i32> %res
}

; Extracts lanes 2-3 of two loaded <4 x i32> vectors, sign-extends each half
; to <2 x i64> and subtracts; the expected instruction is ssubl.2d.
define <2 x i64> @ssubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: ssubl2_2d:
;CHECK: ssubl.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext1 = sext <2 x i32> %high1 to <2 x i64>

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = sext <2 x i32> %high2 to <2 x i64>

  %res = sub <2 x i64> %ext1, %ext2
  ret <2 x i64> %res
}

; Zero-extends two loaded <8 x i8> vectors to <8 x i16> and subtracts them;
; the expected instruction is usubl.8h.
define <8 x i16> @usubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: usubl8h:
;CHECK: usubl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sub <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

; Zero-extends two loaded <4 x i16> vectors to <4 x i32> and subtracts them;
; the expected instruction is usubl.4s.
define <4 x i32> @usubl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: usubl4s:
;CHECK: usubl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sub <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; Zero-extends two loaded <2 x i32> vectors to <2 x i64> and subtracts them;
; the expected instruction is usubl.2d.
define <2 x i64> @usubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: usubl2d:
;CHECK: usubl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sub <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; Extracts lanes 8-15 of two loaded <16 x i8> vectors, zero-extends each half
; to <8 x i16> and subtracts; the expected instruction is usubl.8h.
define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: usubl2_8h:
;CHECK: usubl.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext1 = zext <8 x i8> %high1 to <8 x i16>

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = zext <8 x i8> %high2 to <8 x i16>

  %res = sub <8 x i16> %ext1, %ext2
  ret <8 x i16> %res
}

; Extracts lanes 4-7 of two loaded <8 x i16> vectors, zero-extends each half
; to <4 x i32> and subtracts; the expected instruction is usubl.4s.
define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: usubl2_4s:
;CHECK: usubl.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext1 = zext <4 x i16> %high1 to <4 x i32>

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = zext <4 x i16> %high2 to <4 x i32>

  %res = sub <4 x i32> %ext1, %ext2
  ret <4 x i32> %res
}

; Extracts lanes 2-3 of two loaded <4 x i32> vectors, zero-extends each half
; to <2 x i64> and subtracts; the expected instruction is usubl.2d.
define <2 x i64> @usubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: usubl2_2d:
;CHECK: usubl.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext1 = zext <2 x i32> %high1 to <2 x i64>

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = zext <2 x i32> %high2 to <2 x i64>

  %res = sub <2 x i64> %ext1, %ext2
  ret <2 x i64> %res
}

; Subtracts a sign-extended <8 x i8> vector from a loaded <8 x i16> vector;
; the expected instruction is ssubw.8h.
define <8 x i16> @ssubw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: ssubw8h:
;CHECK: ssubw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp4 = sub <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

; Subtracts a sign-extended <4 x i16> vector from a loaded <4 x i32> vector;
; the expected instruction is ssubw.4s.
define <4 x i32> @ssubw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: ssubw4s:
;CHECK: ssubw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp4 = sub <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

; Subtracts a sign-extended <2 x i32> vector from a loaded <2 x i64> vector;
; the expected instruction is ssubw.2d.
define <2 x i64> @ssubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: ssubw2d:
;CHECK: ssubw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp4 = sub <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

; Subtracts the sign-extended lanes 8-15 of a loaded <16 x i8> vector from a
; loaded <8 x i16> vector; the expected instruction is ssubw.8h.
define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: ssubw2_8h:
;CHECK: ssubw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = sext <8 x i8> %high2 to <8 x i16>

  %res = sub <8 x i16> %tmp1, %ext2
  ret <8 x i16> %res
}

; Subtracts the sign-extended lanes 4-7 of a loaded <8 x i16> vector from a
; loaded <4 x i32> vector; the expected instruction is ssubw.4s.
define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: ssubw2_4s:
;CHECK: ssubw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = sext <4 x i16> %high2 to <4 x i32>

  %res = sub <4 x i32> %tmp1, %ext2
  ret <4 x i32> %res
}

; Subtracts the sign-extended lanes 2-3 of a loaded <4 x i32> vector from a
; loaded <2 x i64> vector; the expected instruction is ssubw.2d.
define <2 x i64> @ssubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: ssubw2_2d:
;CHECK: ssubw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = sext <2 x i32> %high2 to <2 x i64>

  %res = sub <2 x i64> %tmp1, %ext2
  ret <2 x i64> %res
}

; Subtracts a zero-extended <8 x i8> vector from a loaded <8 x i16> vector;
; the expected instruction is usubw.8h.
define <8 x i16> @usubw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: usubw8h:
;CHECK: usubw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp4 = sub <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

; Subtracts a zero-extended <4 x i16> vector from a loaded <4 x i32> vector;
; the expected instruction is usubw.4s.
define <4 x i32> @usubw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: usubw4s:
;CHECK: usubw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp4 = sub <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

; Subtracts a zero-extended <2 x i32> vector from a loaded <2 x i64> vector;
; the expected instruction is usubw.2d.
define <2 x i64> @usubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: usubw2d:
;CHECK: usubw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp4 = sub <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

; Subtracts the zero-extended lanes 8-15 of a loaded <16 x i8> vector from a
; loaded <8 x i16> vector; the expected instruction is usubw.8h.
define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: usubw2_8h:
;CHECK: usubw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = zext <8 x i8> %high2 to <8 x i16>

  %res = sub <8 x i16> %tmp1, %ext2
  ret <8 x i16> %res
}

; Subtracts the zero-extended lanes 4-7 of a loaded <8 x i16> vector from a
; loaded <4 x i32> vector; the expected instruction is usubw.4s.
define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: usubw2_4s:
;CHECK: usubw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = zext <4 x i16> %high2 to <4 x i32>

  %res = sub <4 x i32> %tmp1, %ext2
  ret <4 x i32> %res
}

; Subtracts the zero-extended lanes 2-3 of a loaded <4 x i32> vector from a
; loaded <2 x i64> vector; the expected instruction is usubw.2d.
define <2 x i64> @usubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: usubw2_2d:
;CHECK: usubw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = zext <2 x i32> %high2 to <2 x i64>

  %res = sub <2 x i64> %tmp1, %ext2
  ret <2 x i64> %res
}
