; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s

; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

; Unsigned 32-bit division with both operands loaded from global memory:
; the dividend comes from %in[0] and the divisor from %in[1]. The quotient
; is stored to %out.
; FUNC-LABEL: {{^}}udiv_i32:
; EG-NOT: SETGE_INT
; EG: CF_END

; SI: v_rcp_iflag_f32_e32
define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %a = load i32, i32 addrspace(1)* %in
  %b = load i32, i32 addrspace(1)* %b_ptr
  %result = udiv i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; Unsigned 32-bit division where both operands are passed directly as kernel
; arguments instead of being loaded through a pointer.
; FUNC-LABEL: {{^}}s_udiv_i32:
; SI: v_rcp_iflag_f32_e32
define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
  %result = udiv i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
  ret void
}


; The code generated by udiv is long and complex and may change
; frequently. The goal of these tests is to make sure that ISel doesn't
; fail when it gets a v2i32 or v4i32 udiv.

; Element-wise unsigned division of two <2 x i32> vectors loaded from
; %in[0] and %in[1]. The checks expect one reciprocal instruction per lane
; and that the program is emitted through to its end.
; FUNC-LABEL: {{^}}udiv_v2i32:
; EG: CF_END

; SI: v_rcp_iflag_f32_e32
; SI: v_rcp_iflag_f32_e32
; SI: s_endpgm
define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = udiv <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

; Element-wise unsigned division of two <4 x i32> vectors loaded from
; %in[0] and %in[1]. Only successful code emission up to the end of the
; program is checked.
; FUNC-LABEL: {{^}}udiv_v4i32:
; EG: CF_END
; SI: s_endpgm
define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = udiv <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; Division of a loaded i32 by the constant 16. The checks expect the
; division to be emitted as a single logical right shift by 4 of the loaded
; value, which is then stored.
; FUNC-LABEL: {{^}}udiv_i32_div_pow2:
; SI: buffer_load_dword [[VAL:v[0-9]+]]
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 4, [[VAL]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 16
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; Division of a loaded i32 by the even constant 34259182. The checks expect
; a multiply-high by the constant 0xfabbd9c1 followed by a right shift by 25
; instead of a generic division sequence.
; FUNC-LABEL: {{^}}udiv_i32_div_k_even:
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfabbd9c1
; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]]
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 34259182
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; Division of a loaded i32 by the odd constant 34259183. The checks expect
; a multiply-high by the constant 0x7d5deca3 followed by a right shift by 24
; instead of a generic division sequence.
; FUNC-LABEL: {{^}}udiv_i32_div_k_odd:
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x7d5deca3
; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]]
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 34259183
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; Unsigned division of two i8 values loaded from %in[0] and %in[1]. The
; quotient is zero-extended to i32 before being stored, and the checks
; expect the stored value to come from a mask with 0xff.
; FUNC-LABEL: {{^}}v_udiv_i8:
; SI: v_rcp_iflag_f32
; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}}
; SI: buffer_store_dword [[TRUNC]]
define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %num = load i8, i8 addrspace(1)* %in
  %den = load i8, i8 addrspace(1)* %den_ptr
  %result = udiv i8 %num, %den
  %result.ext = zext i8 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}

; Unsigned division of two i16 values loaded from %in[0] and %in[1]. The
; quotient is zero-extended to i32 before being stored, and the checks
; expect the stored value to come from a mask with 0xffff.
; FUNC-LABEL: {{^}}v_udiv_i16:
; SI: v_rcp_iflag_f32
; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}}
; SI: buffer_store_dword [[TRUNC]]
define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
  %num = load i16, i16 addrspace(1)* %in
  %den = load i16, i16 addrspace(1)* %den_ptr
  %result = udiv i16 %num, %den
  %result.ext = zext i16 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}

; Unsigned division of two i23 values (a non-power-of-two integer width)
; loaded from %in[0] and %in[1]. The quotient is zero-extended to i32, and
; the checks expect the stored value to come from a mask with 0x7fffff.
; FUNC-LABEL: {{^}}v_udiv_i23:
; SI: v_rcp_iflag_f32
; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
; SI: buffer_store_dword [[TRUNC]]
define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
  %num = load i23, i23 addrspace(1)* %in
  %den = load i23, i23 addrspace(1)* %den_ptr
  %result = udiv i23 %num, %den
  %result.ext = zext i23 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}

; Unsigned division of two i24 values loaded from %in[0] and %in[1], with
; the quotient zero-extended to i32. The only check is a negative one: the
; plain v_rcp_f32 instruction must not appear in this function's output.
; NOTE(review): the rationale for the negative check is not stated in this
; file - confirm against the lowering code before tightening it.
; FUNC-LABEL: {{^}}v_udiv_i24:
; SI-NOT: v_rcp_f32
define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
  %num = load i24, i24 addrspace(1)* %in
  %den = load i24, i24 addrspace(1)* %den_ptr
  %result = udiv i24 %num, %den
  %result.ext = zext i24 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}

; Division of a loaded <4 x i32> by a splat of the constant 53668. The
; checks expect four separate multiply-high instructions, one per lane.
; FUNC-LABEL: {{^}}scalarize_mulhu_4xi32:
; SI: v_mul_hi_u32
; SI: v_mul_hi_u32
; SI: v_mul_hi_u32
; SI: v_mul_hi_u32

define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
  %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
  store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; Division of a kernel-argument i32 by 2. The check expects a scalar
; logical right shift by 1. The register patterns accept any SGPR number,
; not just single-digit ones.
; FUNC-LABEL: {{^}}test_udiv2:
; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
define amdgpu_kernel void @test_udiv2(i32 %p) {
  %i = udiv i32 %p, 2
  store volatile i32 %i, i32 addrspace(1)* undef
  ret void
}

; Division of a kernel-argument i32 by 3. The checks expect the constant
; 0xaaaaaaab to be materialized, then a multiply-high whose result is
; immediately shifted right by 1. The multiply-high destination is captured
; rather than hard-coded so the test does not depend on register allocation.
; FUNC-LABEL: {{^}}test_udiv_3_mulhu:
; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], {{s[0-9]+}}, {{v[0-9]+}}
; SI-NEXT: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, [[MULHI]]
define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
  %i = udiv i32 %p, 3
  store volatile i32 %i, i32 addrspace(1)* undef
  ret void
}

; Signed division of two sign-extended i8 values, truncated back to i8 and
; stored. The check expects a v_mad_f32 with a negated first source operand
; in the emitted division sequence.
; NOTE(review): presumably this guards the floating-point based expansion
; of small integer division when fp32 denormals are enabled - confirm
; against the commit that introduced the test.
; GCN-LABEL: {{^}}fdiv_test_denormals:
; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
bb:
  %tmp = load i8, i8 addrspace(1)* null, align 1
  %tmp1 = sext i8 %tmp to i32
  %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef
  %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
  %tmp4 = sext i8 %tmp3 to i32
  %tmp5 = sdiv i32 %tmp1, %tmp4
  %tmp6 = trunc i32 %tmp5 to i8
  store i8 %tmp6, i8 addrspace(1)* null, align 1
  ret void
}
