1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
2; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI  -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
3; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4
5
6; FUNC-LABEL: {{^}}local_size_x:
7; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
8; EG: MOV * [[VAL]], KC0[1].Z
9
10; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
11; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
12; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1
13; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4
14
15; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
16; GCN: buffer_store_dword [[VVAL]]
17define amdgpu_kernel void @local_size_x(i32 addrspace(1)* %out) {
18entry:
19  %0 = call i32 @llvm.r600.read.local.size.x() #0
20  store i32 %0, i32 addrspace(1)* %out
21  ret void
22}
23
24; FUNC-LABEL: {{^}}local_size_y:
25; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
26; EG: MOV * [[VAL]], KC0[1].W
27
28; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
29; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
30; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
31; GCN: buffer_store_dword [[VVAL]]
32define amdgpu_kernel void @local_size_y(i32 addrspace(1)* %out) {
33entry:
34  %0 = call i32 @llvm.r600.read.local.size.y() #0
35  store i32 %0, i32 addrspace(1)* %out
36  ret void
37}
38
39; FUNC-LABEL: {{^}}local_size_z:
40; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
41; EG: MOV * [[VAL]], KC0[2].X
42
43; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
44; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
45; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
46; GCN: buffer_store_dword [[VVAL]]
47define amdgpu_kernel void @local_size_z(i32 addrspace(1)* %out) {
48entry:
49  %0 = call i32 @llvm.r600.read.local.size.z() #0
50  store i32 %0, i32 addrspace(1)* %out
51  ret void
52}
53
54; FUNC-LABEL: {{^}}local_size_xy:
55; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
56; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
57; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
58; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
59; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
60; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]]
61; GCN: buffer_store_dword [[VAL]]
62define amdgpu_kernel void @local_size_xy(i32 addrspace(1)* %out) {
63entry:
64  %x = call i32 @llvm.r600.read.local.size.x() #0
65  %y = call i32 @llvm.r600.read.local.size.y() #0
66  %val = mul i32 %x, %y
67  store i32 %val, i32 addrspace(1)* %out
68  ret void
69}
70
71; FUNC-LABEL: {{^}}local_size_xz:
72
73; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
74; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
75; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
76; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
77; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff
78; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
79; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]]
80; GCN: buffer_store_dword [[VAL]]
81define amdgpu_kernel void @local_size_xz(i32 addrspace(1)* %out) {
82entry:
83  %x = call i32 @llvm.r600.read.local.size.x() #0
84  %z = call i32 @llvm.r600.read.local.size.z() #0
85  %val = mul i32 %x, %z
86  store i32 %val, i32 addrspace(1)* %out
87  ret void
88}
89
90; FUNC-LABEL: {{^}}local_size_yz:
91; HSA: enable_sgpr_private_segment_buffer = 1
92; HSA: enable_sgpr_dispatch_ptr = 1
93
94; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
95; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
96; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
97; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
98; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
99; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]]
100; GCN: buffer_store_dword [[VAL]]
101define amdgpu_kernel void @local_size_yz(i32 addrspace(1)* %out) {
102entry:
103  %y = call i32 @llvm.r600.read.local.size.y() #0
104  %z = call i32 @llvm.r600.read.local.size.z() #0
105  %val = mul i32 %y, %z
106  store i32 %val, i32 addrspace(1)* %out
107  ret void
108}
109
110; FUNC-LABEL: {{^}}local_size_xyz:
111; HSA: enable_sgpr_private_segment_buffer = 1
112; HSA: enable_sgpr_dispatch_ptr = 1
113
114; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
115; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
116; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
117; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
118; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
119; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
120; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
121; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
122; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]]
123; GCN: buffer_store_dword [[VAL]]
124define amdgpu_kernel void @local_size_xyz(i32 addrspace(1)* %out) {
125entry:
126  %x = call i32 @llvm.r600.read.local.size.x() #0
127  %y = call i32 @llvm.r600.read.local.size.y() #0
128  %z = call i32 @llvm.r600.read.local.size.z() #0
129  %xy = mul i32 %x, %y
130  %xyz = add i32 %xy, %z
131  store i32 %xyz, i32 addrspace(1)* %out
132  ret void
133}
134
135; FUNC-LABEL: {{^}}local_size_x_known_bits:
136; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
137; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
138; GCN-NOT: 0xffff
139; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
140; GCN-NEXT: buffer_store_dword [[VVAL]]
141define amdgpu_kernel void @local_size_x_known_bits(i32 addrspace(1)* %out) {
142entry:
143  %size = call i32 @llvm.r600.read.local.size.x() #0
144  %shl = shl i32 %size, 16
145  %shr = lshr i32 %shl, 16
146  store i32 %shr, i32 addrspace(1)* %out
147  ret void
148}
149
150; FUNC-LABEL: {{^}}local_size_y_known_bits:
151; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
152; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
153; GCN-NOT: 0xffff
154; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
155; GCN-NEXT: buffer_store_dword [[VVAL]]
156define amdgpu_kernel void @local_size_y_known_bits(i32 addrspace(1)* %out) {
157entry:
158  %size = call i32 @llvm.r600.read.local.size.y() #0
159  %shl = shl i32 %size, 16
160  %shr = lshr i32 %shl, 16
161  store i32 %shr, i32 addrspace(1)* %out
162  ret void
163}
164
165; FUNC-LABEL: {{^}}local_size_z_known_bits:
166; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
167; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
168; GCN-NOT: 0xffff
169; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
170; GCN-NEXT: buffer_store_dword [[VVAL]]
171define amdgpu_kernel void @local_size_z_known_bits(i32 addrspace(1)* %out) {
172entry:
173  %size = call i32 @llvm.r600.read.local.size.z() #0
174  %shl = shl i32 %size, 16
175  %shr = lshr i32 %shl, 16
176  store i32 %shr, i32 addrspace(1)* %out
177  ret void
178}
179
180declare i32 @llvm.r600.read.local.size.x() #0
181declare i32 @llvm.r600.read.local.size.y() #0
182declare i32 @llvm.r600.read.local.size.z() #0
183
184attributes #0 = { nounwind readnone }
185