1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
3; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4
; Unsigned divide of two i8 values read from consecutive bytes at %in; the
; quotient is written to %out. The checks expect a convert-to-float /
; reciprocal / convert-back sequence on both targets.
; FUNC-LABEL: {{^}}udiv24_i8:
; SI: v_cvt_f32_ubyte
; SI: v_cvt_f32_ubyte
; SI: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
  ; The divisor is the element immediately after the dividend.
  %divisor.addr = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %dividend = load i8, i8 addrspace(1)* %in
  %divisor = load i8, i8 addrspace(1)* %divisor.addr
  %quot = udiv i8 %dividend, %divisor
  store i8 %quot, i8 addrspace(1)* %out
  ret void
}
23
; Unsigned divide of two i16 values read from consecutive elements at %in; the
; quotient is written to %out. The checks expect a convert-to-float /
; reciprocal / convert-back sequence on both targets.
; FUNC-LABEL: {{^}}udiv24_i16:
; SI: v_cvt_f32_u32
; SI: v_cvt_f32_u32
; SI: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
  ; The divisor is the element immediately after the dividend.
  %divisor.addr = getelementptr i16, i16 addrspace(1)* %in, i16 1
  %dividend = load i16, i16 addrspace(1)* %in, align 2
  %divisor = load i16, i16 addrspace(1)* %divisor.addr, align 2
  %quot = udiv i16 %dividend, %divisor
  store i16 %quot, i16 addrspace(1)* %out, align 2
  ret void
}
42
; i32 unsigned divide where both operands are first reduced to 23 significant
; bits. The checks expect a convert-to-float / reciprocal / convert-back
; sequence on both targets.
; FUNC-LABEL: {{^}}udiv23_i32:
; SI: v_cvt_f32_u32
; SI-DAG: v_cvt_f32_u32
; SI-DAG: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %divisor.addr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %dividend = load i32, i32 addrspace(1)* %in, align 4
  %divisor = load i32, i32 addrspace(1)* %divisor.addr, align 4
  ; shl followed by lshr by 9 zeroes the top 9 bits of each operand.
  %dividend.hi = shl i32 %dividend, 9
  %divisor.hi = shl i32 %divisor, 9
  %dividend.u23 = lshr i32 %dividend.hi, 9
  %divisor.u23 = lshr i32 %divisor.hi, 9
  %quot = udiv i32 %dividend.u23, %divisor.u23
  store i32 %quot, i32 addrspace(1)* %out, align 4
  ret void
}
65
; i32 unsigned divide where both operands are reduced to 24 significant bits.
; The checks expect v_rcp_iflag to appear, with no v_rcp_f32 after it and no
; RECIP_IEEE on r600. The negative v_rcp_f32 check previously lacked its
; colon, so FileCheck skipped it.
; FUNC-LABEL: {{^}}udiv24_i32:
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  ; shl followed by lshr by 8 zeroes the top 8 bits of each operand.
  %num.i24.0 = shl i32 %num, 8
  %den.i24.0 = shl i32 %den, 8
  %num.i24 = lshr i32 %num.i24.0, 8
  %den.i24 = lshr i32 %den.i24.0, 8
  %result = udiv i32 %num.i24, %den.i24
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}
82
; i32 unsigned divide with a 23-bit numerator and a 24-bit denominator.
; The checks expect v_rcp_iflag to appear, with no v_rcp_f32 after it and no
; RECIP_IEEE on r600. The negative v_rcp_f32 check previously lacked its
; colon, so FileCheck skipped it.
; FUNC-LABEL: {{^}}no_udiv24_u23_u24_i32:
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  ; Numerator keeps its low 23 bits (shift by 9); denominator keeps its low
  ; 24 bits (shift by 8).
  %num.i23.0 = shl i32 %num, 9
  %den.i24.0 = shl i32 %den, 8
  %num.i23 = lshr i32 %num.i23.0, 9
  %den.i24 = lshr i32 %den.i24.0, 8
  %result = udiv i32 %num.i23, %den.i24
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}
99
; i32 unsigned divide with a 24-bit numerator and a 23-bit denominator.
; The checks expect v_rcp_iflag to appear, with no v_rcp_f32 after it and no
; RECIP_IEEE on r600. The negative v_rcp_f32 check previously lacked its
; colon, so FileCheck skipped it.
; FUNC-LABEL: {{^}}no_udiv24_u24_u23_i32:
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1)* %in, align 4
  %den = load i32, i32 addrspace(1)* %den_ptr, align 4
  ; Numerator keeps its low 24 bits (shift by 8); denominator keeps its low
  ; 23 bits (shift by 9).
  %num.i24.0 = shl i32 %num, 8
  %den.i23.0 = shl i32 %den, 9
  %num.i24 = lshr i32 %num.i24.0, 8
  %den.i23 = lshr i32 %den.i23.0, 9
  %result = udiv i32 %num.i24, %den.i23
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}
116
; i32 unsigned divide where both operands are reduced to 25 significant bits.
; The checks expect no UINT_TO_FLT / RECIP_IEEE on r600 and no v_rcp_f32 on
; amdgcn.
; FUNC-LABEL: {{^}}udiv25_i32:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %divisor.addr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %dividend = load i32, i32 addrspace(1)* %in, align 4
  %divisor = load i32, i32 addrspace(1)* %divisor.addr, align 4
  ; shl followed by lshr by 7 zeroes the top 7 bits of each operand.
  %dividend.hi = shl i32 %dividend, 7
  %divisor.hi = shl i32 %divisor, 7
  %dividend.u25 = lshr i32 %dividend.hi, 7
  %divisor.u25 = lshr i32 %divisor.hi, 7
  %quot = udiv i32 %dividend.u25, %divisor.u25
  store i32 %quot, i32 addrspace(1)* %out, align 4
  ret void
}
136
; i32 unsigned divide with a 24-bit numerator and a 25-bit denominator.
; The checks expect no UINT_TO_FLT / RECIP_IEEE on r600 and no v_rcp_f32 on
; amdgcn.
; FUNC-LABEL: {{^}}test_no_udiv24_i32_1:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %divisor.addr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %dividend = load i32, i32 addrspace(1)* %in, align 4
  %divisor = load i32, i32 addrspace(1)* %divisor.addr, align 4
  ; Numerator keeps its low 24 bits (shift by 8); denominator keeps its low
  ; 25 bits (shift by 7).
  %dividend.hi = shl i32 %dividend, 8
  %divisor.hi = shl i32 %divisor, 7
  %dividend.u24 = lshr i32 %dividend.hi, 8
  %divisor.u25 = lshr i32 %divisor.hi, 7
  %quot = udiv i32 %dividend.u24, %divisor.u25
  store i32 %quot, i32 addrspace(1)* %out, align 4
  ret void
}
156
; i32 unsigned divide with a 25-bit numerator and a 24-bit denominator.
; The checks expect no UINT_TO_FLT / RECIP_IEEE on r600 and no v_rcp_f32 on
; amdgcn.
; FUNC-LABEL: {{^}}test_no_udiv24_i32_2:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %divisor.addr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %dividend = load i32, i32 addrspace(1)* %in, align 4
  %divisor = load i32, i32 addrspace(1)* %divisor.addr, align 4
  ; Numerator keeps its low 25 bits (shift by 7); denominator keeps its low
  ; 24 bits (shift by 8).
  %dividend.hi = shl i32 %dividend, 7
  %divisor.hi = shl i32 %divisor, 8
  %dividend.u25 = lshr i32 %dividend.hi, 7
  %divisor.u24 = lshr i32 %divisor.hi, 8
  %quot = udiv i32 %dividend.u25, %divisor.u24
  store i32 %quot, i32 addrspace(1)* %out, align 4
  ret void
}
176
; Unsigned remainder of two i8 values read from consecutive bytes at %in; the
; remainder is written to %out. The checks expect a convert-to-float /
; reciprocal / convert-back sequence on both targets.
; FUNC-LABEL: {{^}}urem24_i8:
; SI: v_cvt_f32_ubyte
; SI: v_cvt_f32_ubyte
; SI: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
  ; The divisor is the element immediately after the dividend.
  %divisor.addr = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %dividend = load i8, i8 addrspace(1)* %in
  %divisor = load i8, i8 addrspace(1)* %divisor.addr
  %rem = urem i8 %dividend, %divisor
  store i8 %rem, i8 addrspace(1)* %out
  ret void
}
195
; Unsigned remainder of two i16 values read from consecutive elements at %in;
; the remainder is written to %out. The checks expect a convert-to-float /
; reciprocal / convert-back sequence on both targets.
; FUNC-LABEL: {{^}}urem24_i16:
; SI: v_cvt_f32_u32
; SI: v_cvt_f32_u32
; SI: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
  ; The divisor is the element immediately after the dividend.
  %divisor.addr = getelementptr i16, i16 addrspace(1)* %in, i16 1
  %dividend = load i16, i16 addrspace(1)* %in, align 2
  %divisor = load i16, i16 addrspace(1)* %divisor.addr, align 2
  %rem = urem i16 %dividend, %divisor
  store i16 %rem, i16 addrspace(1)* %out, align 2
  ret void
}
214
; i32 unsigned remainder where both operands are reduced to 24 significant
; bits. The checks only require that neither v_rcp_f32 nor RECIP_IEEE appears.
; FUNC-LABEL: {{^}}urem24_i32:
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %divisor.addr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %dividend = load i32, i32 addrspace(1)* %in, align 4
  %divisor = load i32, i32 addrspace(1)* %divisor.addr, align 4
  ; shl followed by lshr by 8 zeroes the top 8 bits of each operand.
  %dividend.hi = shl i32 %dividend, 8
  %divisor.hi = shl i32 %divisor, 8
  %dividend.u24 = lshr i32 %dividend.hi, 8
  %divisor.u24 = lshr i32 %divisor.hi, 8
  %rem = urem i32 %dividend.u24, %divisor.u24
  store i32 %rem, i32 addrspace(1)* %out, align 4
  ret void
}
230
; i32 unsigned remainder where both operands are reduced to 25 significant
; bits. The checks expect no UINT_TO_FLT / RECIP_IEEE on r600 and no
; v_rcp_f32 on amdgcn.
; FUNC-LABEL: {{^}}urem25_i32:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %divisor.addr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %dividend = load i32, i32 addrspace(1)* %in, align 4
  %divisor = load i32, i32 addrspace(1)* %divisor.addr, align 4
  ; shl followed by lshr by 7 zeroes the top 7 bits of each operand.
  %dividend.hi = shl i32 %dividend, 7
  %divisor.hi = shl i32 %divisor, 7
  %dividend.u25 = lshr i32 %dividend.hi, 7
  %divisor.u25 = lshr i32 %divisor.hi, 7
  %rem = urem i32 %dividend.u25, %divisor.u25
  store i32 %rem, i32 addrspace(1)* %out, align 4
  ret void
}
250
; i32 unsigned remainder with a 24-bit numerator and a 25-bit denominator.
; The checks expect no UINT_TO_FLT / RECIP_IEEE on r600 and no v_rcp_f32 on
; amdgcn.
; FUNC-LABEL: {{^}}test_no_urem24_i32_1:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %divisor.addr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %dividend = load i32, i32 addrspace(1)* %in, align 4
  %divisor = load i32, i32 addrspace(1)* %divisor.addr, align 4
  ; Numerator keeps its low 24 bits (shift by 8); denominator keeps its low
  ; 25 bits (shift by 7).
  %dividend.hi = shl i32 %dividend, 8
  %divisor.hi = shl i32 %divisor, 7
  %dividend.u24 = lshr i32 %dividend.hi, 8
  %divisor.u25 = lshr i32 %divisor.hi, 7
  %rem = urem i32 %dividend.u24, %divisor.u25
  store i32 %rem, i32 addrspace(1)* %out, align 4
  ret void
}
270
; i32 unsigned remainder with a 25-bit numerator and a 24-bit denominator.
; The checks expect no UINT_TO_FLT / RECIP_IEEE on r600 and no v_rcp_f32 on
; amdgcn.
; FUNC-LABEL: {{^}}test_no_urem24_i32_2:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %divisor.addr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %dividend = load i32, i32 addrspace(1)* %in, align 4
  %divisor = load i32, i32 addrspace(1)* %divisor.addr, align 4
  ; Numerator keeps its low 25 bits (shift by 7); denominator keeps its low
  ; 24 bits (shift by 8).
  %dividend.hi = shl i32 %dividend, 7
  %divisor.hi = shl i32 %divisor, 8
  %dividend.u25 = lshr i32 %dividend.hi, 7
  %divisor.u24 = lshr i32 %divisor.hi, 8
  %rem = urem i32 %dividend.u25, %divisor.u24
  store i32 %rem, i32 addrspace(1)* %out, align 4
  ret void
}
290
; i32 unsigned divide with a 16-bit numerator and a 23-bit denominator.
; The amdgcn checks expect v_rcp_iflag_f32 plus a v_and with the constant
; 0x7fffff (the low 23 bits) held in a scalar register; the r600 check
; expects RECIP_IEEE.
; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32:
; SI-DAG: v_rcp_iflag_f32
; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],

; EG: RECIP_IEEE
define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %divisor.addr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %dividend = load i32, i32 addrspace(1)* %in, align 4
  %divisor = load i32, i32 addrspace(1)* %divisor.addr, align 4
  ; Numerator keeps its low 16 bits (shift by 16); denominator keeps its low
  ; 23 bits (shift by 9).
  %dividend.hi = shl i32 %dividend, 16
  %divisor.hi = shl i32 %divisor, 9
  %dividend.u16 = lshr i32 %dividend.hi, 16
  %divisor.u23 = lshr i32 %divisor.hi, 9
  %quot = udiv i32 %dividend.u16, %divisor.u23
  store i32 %quot, i32 addrspace(1)* %out, align 4
  ret void
}
309
; i32 unsigned divide with a 23-bit numerator and a 16-bit denominator.
; The amdgcn checks expect v_rcp_iflag_f32 plus a v_and with the constant
; 0x7fffff (the low 23 bits) held in a scalar register; the r600 check
; expects RECIP_IEEE.
; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32:
; SI-DAG: v_rcp_iflag_f32
; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],

; EG: RECIP_IEEE
define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %divisor.addr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %dividend = load i32, i32 addrspace(1)* %in, align 4
  %divisor = load i32, i32 addrspace(1)* %divisor.addr, align 4
  ; Numerator keeps its low 23 bits (shift by 9); denominator keeps its low
  ; 16 bits (shift by 16).
  %dividend.hi = shl i32 %dividend, 9
  %divisor.hi = shl i32 %divisor, 16
  %dividend.u23 = lshr i32 %dividend.hi, 9
  %divisor.u16 = lshr i32 %divisor.hi, 16
  %quot = udiv i32 %dividend.u23, %divisor.u16
  store i32 %quot, i32 addrspace(1)* %out, align 4
  ret void
}
328