1; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-MUBUF %s
2; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,GFX9,NO-D16-HI %s
3; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-FLATSCR %s
5
6; GCN-LABEL: {{^}}store_global_hi_v2i16:
7; GCN: s_waitcnt
8
9; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off
10
11; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
12; GFX803-NEXT: flat_store_short v[0:1], v2
13; GFX906-NEXT: global_store_short v[0:1], v2, off
14
15; GCN-NEXT: s_waitcnt
16; GCN-NEXT: s_setpc_b64
; Reinterprets the i32 %arg as <2 x i16>, extracts element 1 and stores that
; i16 through the addrspace(1) pointer %out, with no address offset.
17define void @store_global_hi_v2i16(i16 addrspace(1)* %out, i32 %arg) #0 {
18entry:
19  ; FIXME: ABI for pre-gfx9
20  %value = bitcast i32 %arg to <2 x i16>
21  %hi = extractelement <2 x i16> %value, i32 1
22  store i16 %hi, i16 addrspace(1)* %out
23  ret void
24}
25
26; GCN-LABEL: {{^}}store_global_hi_v2f16:
27; GCN: s_waitcnt
28
29; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off
30
31; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
32; GFX803-NEXT: flat_store_short v[0:1], v2
33; GFX906-NEXT: global_store_short v[0:1], v2, off
34
35; GCN-NEXT: s_waitcnt
36; GCN-NEXT: s_setpc_b64
; Reinterprets the i32 %arg as <2 x half>, extracts element 1 and stores that
; half through the addrspace(1) pointer %out, with no address offset.
37define void @store_global_hi_v2f16(half addrspace(1)* %out, i32 %arg) #0 {
38entry:
39  ; FIXME: ABI for pre-gfx9
40  %value = bitcast i32 %arg to <2 x half>
41  %hi = extractelement <2 x half> %value, i32 1
42  store half %hi, half addrspace(1)* %out
43  ret void
44}
45
46; GCN-LABEL: {{^}}store_global_hi_i32_shift:
47; GCN: s_waitcnt
48
49; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off
50
51; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
52; GFX803-NEXT: flat_store_short v[0:1], v2
53; GFX906-NEXT: global_store_short v[0:1], v2, off
54
55; GCN-NEXT: s_waitcnt
56; GCN-NEXT: s_setpc_b64
; Scalar formulation of the upper-half store: logically shifts %value right by
; 16, truncates the result to i16 and stores it through the addrspace(1)
; pointer %out.
57define void @store_global_hi_i32_shift(i16 addrspace(1)* %out, i32 %value) #0 {
58entry:
59  %hi32 = lshr i32 %value, 16
60  %hi = trunc i32 %hi32 to i16
61  store i16 %hi, i16 addrspace(1)* %out
62  ret void
63}
64
65; GCN-LABEL: {{^}}store_global_hi_v2i16_i8:
66; GCN: s_waitcnt
67
68; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off
69
70; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
71; GFX803-NEXT: flat_store_byte v[0:1], v2
72; GFX906-NEXT: global_store_byte v[0:1], v2, off
73
74; GCN-NEXT: s_waitcnt
75; GCN-NEXT: s_setpc_b64
; Reinterprets %arg as <2 x i16>, extracts element 1, truncates it to i8 and
; stores that byte through the addrspace(1) pointer %out.
76define void @store_global_hi_v2i16_i8(i8 addrspace(1)* %out, i32 %arg) #0 {
77entry:
78  %value = bitcast i32 %arg to <2 x i16>
79  %hi = extractelement <2 x i16> %value, i32 1
80  %trunc = trunc i16 %hi to i8
81  store i8 %trunc, i8 addrspace(1)* %out
82  ret void
83}
84
85; GCN-LABEL: {{^}}store_global_hi_i8_shift:
86; GCN: s_waitcnt
87
88; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off
89
90; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
91; GFX803-NEXT: flat_store_byte v[0:1], v2
92; GFX906-NEXT: global_store_byte v[0:1], v2, off
93
94; GCN-NEXT: s_waitcnt
95; GCN-NEXT: s_setpc_b64
; Scalar formulation of the byte store: logically shifts %value right by 16,
; truncates the result directly to i8 and stores it through the addrspace(1)
; pointer %out.
96define void @store_global_hi_i8_shift(i8 addrspace(1)* %out, i32 %value) #0 {
97entry:
98  %hi32 = lshr i32 %value, 16
99  %hi = trunc i32 %hi32 to i8
100  store i8 %hi, i8 addrspace(1)* %out
101  ret void
102}
103
104; GCN-LABEL: {{^}}store_global_hi_v2i16_max_offset:
105; GCN: s_waitcnt
106; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:4094
107
108; GFX803-DAG: v_add_u32_e32
109; GFX803-DAG: v_addc_u32_e32
110; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
111; GFX803: flat_store_short v[0:1], v2{{$}}
112
113; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
114; GFX906-NEXT: global_store_short v[0:1], v2, off
115
116; GCN-NEXT: s_waitcnt
117; GCN-NEXT: s_setpc_b64
; Stores element 1 of %arg (viewed as <2 x i16>) through an addrspace(1)
; pointer that is %out advanced by a constant positive element index.
118define void @store_global_hi_v2i16_max_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
119entry:
120  ; FIXME: ABI for pre-gfx9
121  %value = bitcast i32 %arg to <2 x i16>
122  %hi = extractelement <2 x i16> %value, i32 1
  ; 2047 i16 elements is a byte offset of 4094.
123  %gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 2047
124  store i16 %hi, i16 addrspace(1)* %gep
125  ret void
126}
127
128; GCN-LABEL: {{^}}store_global_hi_v2i16_min_offset:
129; GCN: s_waitcnt
130; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:-4096{{$}}
131
132; GFX803-DAG: v_add_u32_e32
133; GFX803-DAG: v_addc_u32_e32
134; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
135; GFX803: flat_store_short v[0:1], v{{[0-9]$}}
136
137; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
138; GFX906-NEXT: global_store_short v[0:1], v2, off
139
140; GCN-NEXT: s_waitcnt
141; GCN-NEXT: s_setpc_b64
; Stores element 1 of %arg (viewed as <2 x i16>) through an addrspace(1)
; pointer that is %out moved back by a constant negative element index.
142define void @store_global_hi_v2i16_min_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
143entry:
144  %value = bitcast i32 %arg to <2 x i16>
145  %hi = extractelement <2 x i16> %value, i32 1
  ; -2048 i16 elements is a byte offset of -4096.
146  %gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 -2048
147  store i16 %hi, i16 addrspace(1)* %gep
148  ret void
149}
150
151; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_max_offset:
152; GCN: s_waitcnt
153; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:4095
154
155; GFX803-DAG: v_add_u32_e32
156; GFX803-DAG: v_addc_u32_e32
157; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
158; GFX803: flat_store_byte v[0:1], v{{[0-9]$}}
159
160; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
161; GFX906-NEXT: global_store_byte v[0:1], v2, off
162
163; GCN-NEXT: s_waitcnt
164; GCN-NEXT: s_setpc_b64
; Extracts element 1 of %arg (viewed as <2 x i16>), truncates it to i8 and
; stores that byte 4095 bytes past the addrspace(1) pointer %out.
165define void @store_global_hi_v2i16_i8_max_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
166entry:
167  %value = bitcast i32 %arg to <2 x i16>
168  %hi = extractelement <2 x i16> %value, i32 1
169  %trunc = trunc i16 %hi to i8
170  %gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4095
171  store i8 %trunc, i8 addrspace(1)* %gep
172  ret void
173}
174
175; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_min_offset:
176; GCN: s_waitcnt
177; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:-4095
178
179; GFX803-DAG: v_add_u32_e32
180; GFX803-DAG: v_addc_u32_e32
181; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
182; GFX803: flat_store_byte v[0:1], v{{[0-9]$}}
183
184; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
185; GFX906-NEXT: global_store_byte v[0:1], v2, off
186
187; GCN-NEXT: s_waitcnt
188; GCN-NEXT: s_setpc_b64
; Extracts element 1 of %arg (viewed as <2 x i16>), truncates it to i8 and
; stores that byte 4095 bytes before the addrspace(1) pointer %out.
189define void @store_global_hi_v2i16_i8_min_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
190entry:
191  %value = bitcast i32 %arg to <2 x i16>
192  %hi = extractelement <2 x i16> %value, i32 1
193  %trunc = trunc i16 %hi to i8
194  %gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 -4095
195  store i8 %trunc, i8 addrspace(1)* %gep
196  ret void
197}
198
199; GCN-LABEL: {{^}}store_flat_hi_v2i16:
200; GCN: s_waitcnt
201
202; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
203
204; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
205; NO-D16-HI-NEXT: flat_store_short v[0:1], v2
206
207; GCN-NEXT: s_waitcnt
208; GCN-NEXT: s_setpc_b64
; Reinterprets %arg as <2 x i16>, extracts element 1 and stores that i16
; through %out, a pointer with no addrspace qualifier.
209define void @store_flat_hi_v2i16(i16* %out, i32 %arg) #0 {
210entry:
211  %value = bitcast i32 %arg to <2 x i16>
212  %hi = extractelement <2 x i16> %value, i32 1
213  store i16 %hi, i16* %out
214  ret void
215}
216
217; GCN-LABEL: {{^}}store_flat_hi_v2f16:
218; GCN: s_waitcnt
219
220; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
221
222; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
223; NO-D16-HI-NEXT: flat_store_short v[0:1], v2
224
225; GCN-NEXT: s_waitcnt
226; GCN-NEXT: s_setpc_b64
; Reinterprets %arg as <2 x half>, extracts element 1 and stores that half
; through %out, a pointer with no addrspace qualifier.
227define void @store_flat_hi_v2f16(half* %out, i32 %arg) #0 {
228entry:
229  %value = bitcast i32 %arg to <2 x half>
230  %hi = extractelement <2 x half> %value, i32 1
231  store half %hi, half* %out
232  ret void
233}
234
235; GCN-LABEL: {{^}}store_flat_hi_i32_shift:
236; GCN: s_waitcnt
237
238; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
239
240; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
241; NO-D16-HI-NEXT: flat_store_short v[0:1], v2
242
243; GCN-NEXT: s_waitcnt
244; GCN-NEXT: s_setpc_b64
; Scalar formulation: logically shifts %value right by 16, truncates to i16
; and stores the result through %out, a pointer with no addrspace qualifier.
245define void @store_flat_hi_i32_shift(i16* %out, i32 %value) #0 {
246entry:
247  %hi32 = lshr i32 %value, 16
248  %hi = trunc i32 %hi32 to i16
249  store i16 %hi, i16* %out
250  ret void
251}
252
253; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8:
254; GCN: s_waitcnt
255
256; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
257
258; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
259; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2
260
261; GCN-NEXT: s_waitcnt
262; GCN-NEXT: s_setpc_b64
; Reinterprets %arg as <2 x i16>, extracts element 1, truncates it to i8 and
; stores that byte through %out, a pointer with no addrspace qualifier.
263define void @store_flat_hi_v2i16_i8(i8* %out, i32 %arg) #0 {
264entry:
265  %value = bitcast i32 %arg to <2 x i16>
266  %hi = extractelement <2 x i16> %value, i32 1
267  %trunc = trunc i16 %hi to i8
268  store i8 %trunc, i8* %out
269  ret void
270}
271
272; GCN-LABEL: {{^}}store_flat_hi_i8_shift:
273; GCN: s_waitcnt
274
275; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
276
277; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
278; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2
279
280; GCN-NEXT: s_waitcnt
281; GCN-NEXT: s_setpc_b64
; Scalar formulation: logically shifts %value right by 16, truncates directly
; to i8 and stores the byte through %out, a pointer with no addrspace
; qualifier.
282define void @store_flat_hi_i8_shift(i8* %out, i32 %value) #0 {
283entry:
284  %hi32 = lshr i32 %value, 16
285  %hi = trunc i32 %hi32 to i8
286  store i8 %hi, i8* %out
287  ret void
288}
289
290; GCN-LABEL: {{^}}store_flat_hi_v2i16_max_offset:
291; GCN: s_waitcnt
292; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}}
293
294; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
295; GFX906-NEXT: flat_store_short v[0:1], v2 offset:4094
296
297; GFX803-DAG: v_add_u32_e32
298; GFX803-DAG: v_addc_u32_e32
299; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
300; GFX803: flat_store_short v[0:1], v2{{$}}
301
302; GCN-NEXT: s_waitcnt
303; GCN-NEXT: s_setpc_b64
; Stores element 1 of %arg (viewed as <2 x i16>) through an unqualified
; pointer that is %out advanced by a constant positive element index.
304define void @store_flat_hi_v2i16_max_offset(i16* %out, i32 %arg) #0 {
305entry:
306  %value = bitcast i32 %arg to <2 x i16>
307  %hi = extractelement <2 x i16> %value, i32 1
  ; 2047 i16 elements is a byte offset of 4094.
308  %gep = getelementptr inbounds i16, i16* %out, i64 2047
309  store i16 %hi, i16* %gep
310  ret void
311}
312
313; GCN-LABEL: {{^}}store_flat_hi_v2i16_neg_offset:
314; GCN: s_waitcnt
315; GFX803: v_add{{(_co)?}}_{{i|u}}32_e32
316; GFX803: v_addc_u32_e32
317
318; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff802, v
319; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v
320
321; GFX906-DAG: v_lshrrev_b32_e32
322; GFX906: flat_store_short v[0:1], v2{{$}}
323
324; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
325; GFX803: flat_store_short v[0:1], v2{{$}}
326; GCN-NEXT: s_waitcnt
327; GCN-NEXT: s_setpc_b64
; Stores element 1 of %arg (viewed as <2 x i16>) through an unqualified
; pointer that is %out moved back by a constant negative element index.
328define void @store_flat_hi_v2i16_neg_offset(i16* %out, i32 %arg) #0 {
329entry:
330  %value = bitcast i32 %arg to <2 x i16>
331  %hi = extractelement <2 x i16> %value, i32 1
  ; -1023 i16 elements is a byte offset of -2046, which is 0xfffff802 as a
  ; 32-bit two's-complement value.
332  %gep = getelementptr inbounds i16, i16* %out, i64 -1023
333  store i16 %hi, i16* %gep
334  ret void
335}
336
337; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_max_offset:
338; GCN: s_waitcnt
339; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:4095{{$}}
340
341; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
342; GFX803-DAG: v_add_u32_e32
343; GFX803-DAG: v_addc_u32_e32
344; GFX803: flat_store_byte v[0:1], v2{{$}}
345
346; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
347; GFX906-NEXT: flat_store_byte v[0:1], v2 offset:4095{{$}}
348
349; GCN-NEXT: s_waitcnt
350; GCN-NEXT: s_setpc_b64
; Extracts element 1 of %arg (viewed as <2 x i16>), truncates it to i8 and
; stores that byte 4095 bytes past the unqualified pointer %out.
351define void @store_flat_hi_v2i16_i8_max_offset(i8* %out, i32 %arg) #0 {
352entry:
353  %value = bitcast i32 %arg to <2 x i16>
354  %hi = extractelement <2 x i16> %value, i32 1
355  %trunc = trunc i16 %hi to i8
356  %gep = getelementptr inbounds i8, i8* %out, i64 4095
357  store i8 %trunc, i8* %gep
358  ret void
359}
360
361; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_neg_offset:
362; GCN: s_waitcnt
363
364; GFX803-DAG: v_add_u32_e32
365; GFX803-DAG: v_addc_u32_e32
366
367; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff001, v
368; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v{{[0-9]+}}, vcc
369
370; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
371
372; GFX906-DAG: v_lshrrev_b32_e32 v2, 16, v2
373; GFX906: flat_store_byte v[0:1], v2{{$}}
374
375; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
376; GFX803: flat_store_byte v[0:1], v2{{$}}
377
378; GCN-NEXT: s_waitcnt
379; GCN-NEXT: s_setpc_b64
; Extracts element 1 of %arg (viewed as <2 x i16>), truncates it to i8 and
; stores that byte 4095 bytes before the unqualified pointer %out.
380define void @store_flat_hi_v2i16_i8_neg_offset(i8* %out, i32 %arg) #0 {
381entry:
382  %value = bitcast i32 %arg to <2 x i16>
383  %hi = extractelement <2 x i16> %value, i32 1
384  %trunc = trunc i16 %hi to i8
  ; -4095 is 0xfffff001 as a 32-bit two's-complement value.
385  %gep = getelementptr inbounds i8, i8* %out, i64 -4095
386  store i8 %trunc, i8* %gep
387  ret void
388}
389
390; GCN-LABEL: {{^}}store_private_hi_v2i16:
391; GCN: s_waitcnt
392
393; GFX900-MUBUF-NEXT:   buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; The end-of-line anchor below keeps the flat-scratch expectation from also
; accepting a store that carries a trailing operand after "off".
394; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}}
395
396; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
397; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}
398
399; GCN-NEXT: s_waitcnt
400; GCN-NEXT: s_setpc_b64
; Reinterprets %arg as <2 x i16>, extracts element 1 and stores that i16
; through the addrspace(5) pointer %out, with no address offset.
401define void @store_private_hi_v2i16(i16 addrspace(5)* %out, i32 %arg) #0 {
402entry:
403  ; FIXME: ABI for pre-gfx9
404  %value = bitcast i32 %arg to <2 x i16>
405  %hi = extractelement <2 x i16> %value, i32 1
406  store i16 %hi, i16 addrspace(5)* %out
407  ret void
408}
409
410; GCN-LABEL: {{^}}store_private_hi_v2f16:
411; GCN: s_waitcnt
412
413; GFX900-MUBUF-NEXT:   buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
414; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}}
415
416; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
417; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}
418
419; GCN-NEXT: s_waitcnt
420; GCN-NEXT: s_setpc_b64
; Reinterprets %arg as <2 x half>, extracts element 1 and stores that half
; through the addrspace(5) pointer %out, with no address offset.
421define void @store_private_hi_v2f16(half addrspace(5)* %out, i32 %arg) #0 {
422entry:
423  ; FIXME: ABI for pre-gfx9
424  %value = bitcast i32 %arg to <2 x half>
425  %hi = extractelement <2 x half> %value, i32 1
426  store half %hi, half addrspace(5)* %out
427  ret void
428}
429
430; GCN-LABEL: {{^}}store_private_hi_i32_shift:
431; GCN: s_waitcnt
432
433; GFX900-MUBUF-NEXT:   buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
434; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}}
435
436; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
437; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}
438
439; GCN-NEXT: s_waitcnt
440; GCN-NEXT: s_setpc_b64
; Scalar formulation: logically shifts %value right by 16, truncates to i16
; and stores the result through the addrspace(5) pointer %out.
441define void @store_private_hi_i32_shift(i16 addrspace(5)* %out, i32 %value) #0 {
442entry:
443  %hi32 = lshr i32 %value, 16
444  %hi = trunc i32 %hi32 to i16
445  store i16 %hi, i16 addrspace(5)* %out
446  ret void
447}
448
449; GCN-LABEL: {{^}}store_private_hi_v2i16_i8:
450; GCN: s_waitcnt
451
452; GFX900-MUBUF-NEXT:   buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
453; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}}
454
455; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
456; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}}
457
458; GCN-NEXT: s_waitcnt
459; GCN-NEXT: s_setpc_b64
; Reinterprets %arg as <2 x i16>, extracts element 1, truncates it to i8 and
; stores that byte through the addrspace(5) pointer %out.
460define void @store_private_hi_v2i16_i8(i8 addrspace(5)* %out, i32 %arg) #0 {
461entry:
462  %value = bitcast i32 %arg to <2 x i16>
463  %hi = extractelement <2 x i16> %value, i32 1
464  %trunc = trunc i16 %hi to i8
465  store i8 %trunc, i8 addrspace(5)* %out
466  ret void
467}
468
469; GCN-LABEL: {{^}}store_private_hi_i8_shift:
470; GCN: s_waitcnt
471
472; GFX900-MUBUF-NEXT:   buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
473; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}}
474
475; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
476; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}}
477
478; GCN-NEXT: s_waitcnt
479; GCN-NEXT: s_setpc_b64
; Scalar formulation: logically shifts %value right by 16, truncates directly
; to i8 and stores the byte through the addrspace(5) pointer %out.
480define void @store_private_hi_i8_shift(i8 addrspace(5)* %out, i32 %value) #0 {
481entry:
482  %hi32 = lshr i32 %value, 16
483  %hi = trunc i32 %hi32 to i8
484  store i8 %hi, i8 addrspace(5)* %out
485  ret void
486}
487
488; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset:
489; GCN: s_waitcnt
490; GFX900-MUBUF:   buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
491; GFX900-FLATSCR: scratch_store_short_d16_hi off, v0, s32 offset:4094{{$}}
492
493; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
494; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}}
495
496; GCN-NEXT: s_waitcnt
497; GCN-NEXT: s_setpc_b64
; Stores element 1 of %arg (viewed as <2 x i16>) at a constant positive
; element index from %out. Unlike the other addrspace(5) cases, %out is
; declared byval(i16), and %arg is the only remaining i32 argument.
498define void @store_private_hi_v2i16_max_offset(i16 addrspace(5)* byval(i16) %out, i32 %arg) #0 {
499entry:
500  %value = bitcast i32 %arg to <2 x i16>
501  %hi = extractelement <2 x i16> %value, i32 1
  ; 2047 i16 elements is a byte offset of 4094.
502  %gep = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2047
503  store i16 %hi, i16 addrspace(5)* %gep
504  ret void
505}
506
507
508
509; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff:
510; GCN: s_waitcnt
511
512; GFX900-MUBUF-NEXT:   buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}}
513; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0
514; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, [[SOFF]]{{$}}
515
516; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
517; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], 0{{$}}
518
519; GCN-NEXT: s_waitcnt
520; GCN-NEXT: s_setpc_b64
; Stores element 1 of %arg (viewed as <2 x i16>) to the null addrspace(5)
; pointer. There is no pointer argument, so the address is a constant.
521define void @store_private_hi_v2i16_nooff(i32 %arg) #0 {
522entry:
523  ; FIXME: ABI for pre-gfx9
524  %value = bitcast i32 %arg to <2 x i16>
525  %hi = extractelement <2 x i16> %value, i32 1
  ; The store is marked volatile; presumably so that the store to a null
  ; address is not removed before instruction selection -- confirm.
526  store volatile i16 %hi, i16 addrspace(5)* null
527  ret void
528}
529
530
531; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff:
532; GCN: s_waitcnt
533
534; GFX900-MUBUF-NEXT:   buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}}
535; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0
536; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, [[SOFF]]{{$}}
537
538; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
539; NO-D16-HI: buffer_store_byte v0, off, s[0:3], 0{{$}}
540
541; GCN-NEXT: s_waitcnt
542; GCN-NEXT: s_setpc_b64
; Extracts element 1 of %arg (viewed as <2 x i16>), truncates it to i8 and
; stores that byte to the null addrspace(5) pointer using a volatile store.
543define void @store_private_hi_v2i16_i8_nooff(i32 %arg) #0 {
544entry:
545  %value = bitcast i32 %arg to <2 x i16>
546  %hi = extractelement <2 x i16> %value, i32 1
547  %trunc = trunc i16 %hi to i8
548  store volatile i8 %trunc, i8 addrspace(5)* null
549  ret void
550}
551
552; GCN-LABEL: {{^}}store_local_hi_v2i16:
553; GCN: s_waitcnt
554
555; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
556
557; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
558; NO-D16-HI: ds_write_b16 v0, v1
559
560; GCN-NEXT: s_waitcnt
561; GCN-NEXT: s_setpc_b64
; Reinterprets %arg as <2 x i16>, extracts element 1 and stores that i16
; through the addrspace(3) pointer %out, with no address offset.
562define void @store_local_hi_v2i16(i16 addrspace(3)* %out, i32 %arg) #0 {
563entry:
564  ; FIXME: ABI for pre-gfx9
565  %value = bitcast i32 %arg to <2 x i16>
566  %hi = extractelement <2 x i16> %value, i32 1
567  store i16 %hi, i16 addrspace(3)* %out
568  ret void
569}
570
571; GCN-LABEL: {{^}}store_local_hi_v2f16:
572; GCN: s_waitcnt
573
574; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
575
576; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
577; NO-D16-HI: ds_write_b16 v0, v1
578
579; GCN-NEXT: s_waitcnt
580; GCN-NEXT: s_setpc_b64
; Reinterprets %arg as <2 x half>, extracts element 1 and stores that half
; through the addrspace(3) pointer %out, with no address offset.
581define void @store_local_hi_v2f16(half addrspace(3)* %out, i32 %arg) #0 {
582entry:
583  ; FIXME: ABI for pre-gfx9
584  %value = bitcast i32 %arg to <2 x half>
585  %hi = extractelement <2 x half> %value, i32 1
586  store half %hi, half addrspace(3)* %out
587  ret void
588}
589
590; GCN-LABEL: {{^}}store_local_hi_i32_shift:
591; GCN: s_waitcnt
592
593; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
594
595; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
596; NO-D16-HI: ds_write_b16 v0, v1
597
598; GCN-NEXT: s_waitcnt
599; GCN-NEXT: s_setpc_b64
; Scalar formulation: logically shifts %value right by 16, truncates to i16
; and stores the result through the addrspace(3) pointer %out.
600define void @store_local_hi_i32_shift(i16 addrspace(3)* %out, i32 %value) #0 {
601entry:
602  %hi32 = lshr i32 %value, 16
603  %hi = trunc i32 %hi32 to i16
604  store i16 %hi, i16 addrspace(3)* %out
605  ret void
606}
607
608; GCN-LABEL: {{^}}store_local_hi_v2i16_i8:
609; GCN: s_waitcnt
610
611; GFX900-NEXT: ds_write_b8_d16_hi v0, v1{{$}}
612
613; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
614; NO-D16-HI: ds_write_b8 v0, v1
615
616; GCN-NEXT: s_waitcnt
617; GCN-NEXT: s_setpc_b64
; Reinterprets %arg as <2 x i16>, extracts element 1, truncates it to i8 and
; stores that byte through the addrspace(3) pointer %out.
618define void @store_local_hi_v2i16_i8(i8 addrspace(3)* %out, i32 %arg) #0 {
619entry:
620  %value = bitcast i32 %arg to <2 x i16>
621  %hi = extractelement <2 x i16> %value, i32 1
622  %trunc = trunc i16 %hi to i8
623  store i8 %trunc, i8 addrspace(3)* %out
624  ret void
625}
626
627; GCN-LABEL: {{^}}store_local_hi_v2i16_max_offset:
628; GCN: s_waitcnt
629; GFX900-NEXT: ds_write_b16_d16_hi v0, v1 offset:65534{{$}}
630
631; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
632; NO-D16-HI: ds_write_b16 v0, v1 offset:65534{{$}}
633
634; GCN-NEXT: s_waitcnt
635; GCN-NEXT: s_setpc_b64
; Stores element 1 of %arg (viewed as <2 x i16>) through an addrspace(3)
; pointer that is %out advanced by a constant positive element index.
636define void @store_local_hi_v2i16_max_offset(i16 addrspace(3)* %out, i32 %arg) #0 {
637entry:
638  ; FIXME: ABI for pre-gfx9
639  %value = bitcast i32 %arg to <2 x i16>
640  %hi = extractelement <2 x i16> %value, i32 1
  ; 32767 i16 elements is a byte offset of 65534.
641  %gep = getelementptr inbounds i16, i16 addrspace(3)* %out, i64 32767
642  store i16 %hi, i16 addrspace(3)* %gep
643  ret void
644}
645
646; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset:
647; GCN: s_waitcnt
648; GFX900-MUBUF:        buffer_store_dword
649; GFX900-MUBUF-NEXT:   buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094
650; GFX900-FLATSCR:      scratch_store_dword
651; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4094
; Allocates two addrspace(5) stack objects, performs a volatile i32 store of
; 123 to the start of the first, then stores element 1 of %arg (viewed as
; <2 x i16>) into a fixed element of the second.
652define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 {
653entry:
654  %obj0 = alloca [10 x i32], align 4, addrspace(5)
655  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
656  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
657  store volatile i32 123, i32 addrspace(5)* %bc
658  %value = bitcast i32 %arg to <2 x i16>
659  %hi = extractelement <2 x i16> %value, i32 1
  ; Element 2027 is 4054 bytes into %obj1. The expected store offset of 4094
  ; is 40 bytes larger, which equals the size of %obj0 ([10 x i32]);
  ; presumably %obj0 is placed first in the frame -- confirm.
660  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
661  store i16 %hi, i16 addrspace(5)* %gep
662  ret void
663}
664
665; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset:
666; GCN: s_waitcnt
667; GFX900-MUBUF:        buffer_store_dword
668; GFX900-MUBUF-NEXT:   buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095
669; GFX900-FLATSCR:      scratch_store_dword
670; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4095
; Allocates two addrspace(5) stack objects, performs a volatile i32 store of
; 123 to the start of the first, then stores the i8 truncation of element 1
; of %arg (viewed as <2 x i16>) into a fixed byte of the second.
671define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 {
672entry:
673  %obj0 = alloca [10 x i32], align 4, addrspace(5)
674  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
675  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
676  store volatile i32 123, i32 addrspace(5)* %bc
677  %value = bitcast i32 %arg to <2 x i16>
678  %hi = extractelement <2 x i16> %value, i32 1
  ; Byte 4055 of %obj1. The expected store offset of 4095 is 40 bytes larger,
  ; which equals the size of %obj0 ([10 x i32]); presumably %obj0 is placed
  ; first in the frame -- confirm.
679  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
680  %trunc = trunc i16 %hi to i8
681  store i8 %trunc, i8 addrspace(5)* %gep
682  ret void
683}
684
; Attribute group #0, referenced by the function definitions above; it
; contains only nounwind.
685attributes #0 = { nounwind }
686