1; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
2; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
4
; External declaration of the i64-returning id query that every kernel in this
; file calls with argument 0; the symbol is the Itanium-mangled spelling of
; get_global_id(unsigned int). No body is provided here.
5declare i64 @_Z13get_global_idj(i32)
6
; Straight-line kernel: eight i64 loads from one computed address, at constant
; element offsets 0, 256, ..., 1792 (i.e. 2048 bytes apart), summed and stored
; back to the per-work-item base.
; The check lines expect eight loads for each target: plain flat loads with no
; offset operand on gfx803, and global loads with immediate offsets drawn from
; {none, 2048, -4096} on gfx900 and {none, -2048} on gfx1010.
7define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)*  %buffer) {
8; GCN-LABEL: clmem_read_simplified:
9; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
10; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
11; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
12; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
13; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
14; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
15; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
16; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
17;
18; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
19; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
20; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
21; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
22; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
23; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
24; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
25; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
26;
27; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
28; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
29; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
30; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
31; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
32; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
33; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
34; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
35
36entry:
  ; Base pointer %saddr = %buffer + ((id << 7) & 0xFFFF8000) bytes, viewed as i64*.
  ; %conv = id & 255 is the i64 element index added on top of that base.
37  %call = tail call i64 @_Z13get_global_idj(i32 0)
38  %conv = and i64 %call, 255
39  %a0 = shl i64 %call, 7
40  %idx.ext11 = and i64 %a0, 4294934528
41  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
42  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
43
  ; %addr1 is the common address all eight loads are expressed relative to.
44  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
45  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
46  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
47  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
48  %add.1 = add i64 %load2, %load1
49
  ; Elements 512 and 768 => byte offsets 4096 and 6144 from %addr1.
50  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
51  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
52  %add.2 = add i64 %load3, %add.1
53  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
54  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
55  %add.3 = add i64 %load4, %add.2
56
  ; Elements 1024 and 1280 => byte offsets 8192 and 10240 from %addr1.
57  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
58  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
59  %add.4 = add i64 %load5, %add.3
60  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
61  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
62  %add.5 = add i64 %load6, %add.4
63
  ; Elements 1536 and 1792 => byte offsets 12288 and 14336 from %addr1.
64  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
65  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
66  %add.6 = add i64 %load7, %add.5
67  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
68  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
69  %add.7 = add i64 %load8, %add.6
70
  ; The running sum of all eight loads is written to the base (not to %addr1).
71  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
72  ret void
73}
74
; Looped variant: a two-level loop whose inner body performs eleven i64 loads
; at element indices (%block.029 | k) for k = 0, 256, ..., 2560 and accumulates
; them; the final sum is stored to the per-work-item base after the loops.
; The check lines expect eleven loads per target: offset-less flat loads on
; gfx803, and global loads with immediate offsets drawn from
; {none, -2048, -4096} on gfx900 and {none, -2048} on gfx1010.
75define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)*  %buffer) {
76; GCN-LABEL: clmem_read:
77; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
78; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
79; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
80; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
81; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
82; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
83; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
84; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
85; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
86; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
87; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
88;
89; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
90; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
91; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
92; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
93; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
94; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
95; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
96; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
97; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
98; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
99; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
100;
101; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
102; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
103; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
104; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
105; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
106; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
107; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
108; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
109; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
110; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
111; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
112entry:
  ; Base %a1 = %buffer + ((id << 17) & 0xFE000000) bytes, viewed as i64*;
  ; %add.ptr6 adds the element index (id & 255) and is the base for every load.
113  %call = tail call i64 @_Z13get_global_idj(i32 0)
114  %conv = and i64 %call, 255
115  %a0 = shl i64 %call, 17
116  %idx.ext11 = and i64 %a0, 4261412864
117  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
118  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
119  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
120  br label %for.cond.preheader
121
; Outer-loop latch: leaves the loop once the counter value for the pass that
; just finished (%dec31, before decrementing) is 0.
122while.cond.loopexit:                              ; preds = %for.body
123  %dec = add nsw i32 %dec31, -1
124  %tobool = icmp eq i32 %dec31, 0
125  br i1 %tobool, label %while.end, label %for.cond.preheader
126
; Outer-loop header: %dec31 counts down from 127; %sum.030 carries the sum
; accumulated by the previous passes (0 on entry).
127for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
128  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
129  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
130  br label %for.body
131
; Inner loop: %block.029 starts at 0 and advances by 8192 per iteration while
; it stays below 4194304. Each constant OR-ed in below is smaller than 8192,
; so the `or` cannot collide with set bits of %block.029.
132for.body:                                         ; preds = %for.body, %for.cond.preheader
133  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
134  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
135  %conv3 = zext i32 %block.029 to i64
136  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
137  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
138  %add = add i64 %load1, %sum.128
139
140  %add9 = or i32 %block.029, 256
141  %conv3.1 = zext i32 %add9 to i64
142  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
143  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
144  %add.1 = add i64 %load2, %add
145
146  %add9.1 = or i32 %block.029, 512
147  %conv3.2 = zext i32 %add9.1 to i64
148  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
149  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
150  %add.2 = add i64 %l3, %add.1
151
152  %add9.2 = or i32 %block.029, 768
153  %conv3.3 = zext i32 %add9.2 to i64
154  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
155  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
156  %add.3 = add i64 %l4, %add.2
157
158  %add9.3 = or i32 %block.029, 1024
159  %conv3.4 = zext i32 %add9.3 to i64
160  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
161  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
162  %add.4 = add i64 %l5, %add.3
163
164  %add9.4 = or i32 %block.029, 1280
165  %conv3.5 = zext i32 %add9.4 to i64
166  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
167  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
168  %add.5 = add i64 %l6, %add.4
169
170  %add9.5 = or i32 %block.029, 1536
171  %conv3.6 = zext i32 %add9.5 to i64
172  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
173  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
174  %add.6 = add i64 %load7, %add.5
175
176  %add9.6 = or i32 %block.029, 1792
177  %conv3.7 = zext i32 %add9.6 to i64
178  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
179  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
180  %add.7 = add i64 %load8, %add.6
181
182  %add9.7 = or i32 %block.029, 2048
183  %conv3.8 = zext i32 %add9.7 to i64
184  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
185  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
186  %add.8 = add i64 %load9, %add.7
187
188  %add9.8 = or i32 %block.029, 2304
189  %conv3.9 = zext i32 %add9.8 to i64
190  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
191  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
192  %add.9 = add i64 %load10, %add.8
193
194  %add9.9 = or i32 %block.029, 2560
195  %conv3.10 = zext i32 %add9.9 to i64
196  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
197  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
198  %add.10 = add i64 %load11, %add.9
199
  ; Inner-loop step and bound (unsigned compare against 4194304).
200  %add9.31 = add nuw nsw i32 %block.029, 8192
201  %cmp.31 = icmp ult i32 %add9.31, 4194304
202  br i1 %cmp.31, label %for.body, label %while.cond.loopexit
203
; Exit: write the accumulated sum to the base pointer %a1.
204while.end:                                        ; preds = %while.cond.loopexit
205  store i64 %add.10, i64 addrspace(1)* %a1, align 8
206  ret void
207}
208
209; using 32bit address.
; Ten i32 loads from one computed address at constant element offsets
; 0, 256, ..., 2304 (1024 bytes apart), summed and stored to the base.
; The check lines expect ten dword loads per target: offset-less flat loads on
; gfx803, and global loads with immediate offsets drawn from
; {none, 1024, 2048, 3072} on gfx900 and {none, 1024, -2048} on gfx1010.
210define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
211; GCN-LABEL: Address32:
212; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
213; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
214; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
215; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
216; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
217; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
218; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
219; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
220; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
221; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
222;
223; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
224; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
225; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
226; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
227; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
228; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
229; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
230; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
231; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
232; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
233;
234; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
235; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
236; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
237; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
238; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
239; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
240; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
241; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
242; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
243; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
244entry:
   ; Base %addr = %buffer + ((id << 7) & 0xFFFF8000) bytes, viewed as i32*.
245   %call = tail call i64 @_Z13get_global_idj(i32 0)
246   %conv = and i64 %call, 255
247   %id = shl i64 %call, 7
248   %idx.ext11 = and i64 %id, 4294934528
249   %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
250   %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*
251
   ; %add.ptr6 (base + (id & 255) i32 elements) anchors all ten loads below.
252   %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
253   %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4
254
255   %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
256   %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
257   %add.1 = add i32 %load2, %load1
258
259   %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
260   %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
261   %add.2 = add i32 %load3, %add.1
262
263   %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
264   %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
265   %add.3 = add i32 %load4, %add.2
266
267   %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
268   %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
269   %add.4 = add i32 %load5, %add.3
270
271   %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
272   %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
273   %add.5 = add i32 %load6, %add.4
274
275   %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
276   %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
277   %add.6 = add i32 %load7, %add.5
278
279   %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
280   %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
281   %add.7 = add i32 %load8, %add.6
282
283   %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
284   %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
285   %add.8 = add i32 %load9, %add.7
286
287   %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
288   %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
289   %add.9 = add i32 %load10, %add.8
290
   ; The sum of the ten loads goes to the base %addr (not to %add.ptr6).
291   store i32 %add.9, i32 addrspace(1)* %addr, align 4
292   ret void
293}
294
; Four i64 loads whose constant element offsets are 0, 536870400, 536870656 and
; 536870912, i.e. byte offsets 0, 2^32-4096, 2^32-2048 and 2^32 -- the large
; ones do not fit in 32 bits.
; The check lines expect four loads per target: offset-less flat loads on
; gfx803, and global loads with immediate offsets drawn from
; {none, -4096, 2048} on gfx900 and {none, -2048} on gfx1010.
295define amdgpu_kernel void @Offset64(i8 addrspace(1)*  %buffer) {
296; GCN-LABEL: Offset64:
297; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
298; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
299; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
300; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
301;
302; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
303; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
304; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
305; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
306;
307; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
308; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
309; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
310; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
311entry:
  ; Base %saddr = %buffer + ((id << 7) & 0xFFFF8000) bytes, viewed as i64*.
312  %call = tail call i64 @_Z13get_global_idj(i32 0)
313  %conv = and i64 %call, 255
314  %a0 = shl i64 %call, 7
315  %idx.ext11 = and i64 %a0, 4294934528
316  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
317  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
318
  ; %addr1 (base + (id & 255) elements) anchors the three far loads below.
319  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
320  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
321
  ; 536870400 * 8 = 2^32 - 4096 bytes.
322  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
323  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
324
325  %add1 = add i64 %load2, %load1
326
  ; 536870656 * 8 = 2^32 - 2048 bytes.
327  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
328  %load3 = load i64, i64 addrspace(1)* %addr3, align 8
329
330  %add2 = add i64 %load3, %add1
331
  ; 536870912 * 8 = 2^32 bytes.
332  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
333  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
334  %add4 = add i64 %load4, %add2
335
336  store i64 %add4, i64 addrspace(1)* %saddr, align 8
337  ret void
338}
339
340; TODO: Support load4 as anchor instruction.
; i32 counterpart of the large-offset case: four i32 loads at constant element
; offsets 0, 536870400, 536870656 and 536870912, i.e. byte offsets 0,
; 2^31-2048, 2^31-1024 and 2^31.
; The check lines expect four dword loads per target: offset-less flat loads on
; gfx803, and global loads with immediate offsets drawn from
; {none, 2048, 3072} on gfx900 and {none, -2048, 1024} on gfx1010.
; NOTE(review): the i32 loads and the store below are declared `align 8`,
; although the address is base + 4 * (id & 255) -- confirm this is intended.
341define amdgpu_kernel void @p32Offset64(i8 addrspace(1)*  %buffer) {
342; GCN-LABEL: p32Offset64:
343; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
344; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
345; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
346; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
347;
348; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
349; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
350; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
351; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
352;
353; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
354; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
355; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
356; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
357entry:
  ; Base %saddr = %buffer + ((id << 7) & 0xFFFF8000) bytes, viewed as i32*.
358  %call = tail call i64 @_Z13get_global_idj(i32 0)
359  %conv = and i64 %call, 255
360  %a0 = shl i64 %call, 7
361  %idx.ext11 = and i64 %a0, 4294934528
362  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
363  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*
364
  ; %addr1 (base + (id & 255) i32 elements) anchors the three far loads below.
365  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
366  %load1 = load i32, i32 addrspace(1)* %addr1, align 8
367
  ; 536870400 * 4 = 2^31 - 2048 bytes.
368  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
369  %load2 = load i32, i32 addrspace(1)* %addr2, align 8
370
371  %add1 = add i32 %load2, %load1
372
  ; 536870656 * 4 = 2^31 - 1024 bytes.
373  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
374  %load3 = load i32, i32 addrspace(1)* %addr3, align 8
375
376  %add2 = add i32 %load3, %add1
377
  ; 536870912 * 4 = 2^31 bytes.
378  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
379  %load4 = load i32, i32 addrspace(1)* %addr4, align 8
380  %add4 = add i32 %load4, %add2
381
382  store i32 %add4, i32 addrspace(1)* %saddr, align 8
383  ret void
384}
385
; Two independent base pointers: three i64 loads from %buffer1 (elements 512,
; 768, 1024) and three from %buffer2 (elements 1280, 1536, 1792), both offset
; by the same per-work-item byte index. The six values are summed and stored
; through the %buffer1-derived base.
; The check lines expect six loads per target: offset-less flat loads on
; gfx803, and global loads with immediate offsets drawn from
; {none, 2048, -4096} on gfx900 and {none, -2048} on gfx1010.
; Note that the check comments sit between the two parameters of the signature.
386define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
387; GCN-LABEL: DiffBase:
388; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
389; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
390; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
391; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
392; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
393; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
394;
395; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
396; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
397; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
398; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
399; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
400; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
401;
402; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
403; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
404; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
405; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
406; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
407; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
408                                    i8 addrspace(1)* %buffer2) {
409entry:
  ; %idx.ext11 = (id << 7) & 0xFFFF8000 is the byte index applied to both
  ; buffers. %conv is computed but has no use in this function.
410  %call = tail call i64 @_Z13get_global_idj(i32 0)
411  %conv = and i64 %call, 255
412  %a0 = shl i64 %call, 7
413  %idx.ext11 = and i64 %a0, 4294934528
414  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
415  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
416
417  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
418  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*
419
  ; First group: loads relative to %saddr (byte offsets 4096, 6144, 8192).
420  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
421  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
422  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
423  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
424  %add1 = add i64 %load2, %load1
425  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
426  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
427  %add2 = add i64 %load3, %add1
428
  ; Second group: loads relative to %saddr2 (byte offsets 10240, 12288, 14336).
429  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
430  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
431
432  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
433  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
434  %add3 = add i64 %load5, %load4
435
436  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
437  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
438  %add4 = add i64 %load6, %add3
439
  ; Combine the two partial sums and store through the first base.
440  %add5 = add i64 %add2, %add4
441
442  store i64 %add5, i64 addrspace(1)* %saddr, align 8
443  ret void
444}
445
; Same eight element offsets as clmem_read_simplified (0, 256, ..., 1792), but
; after the load at offset 0 the remaining loads appear in descending offset
; order (1792 down to 256) in the IR.
; The check lines expect eight loads per target: offset-less flat loads on
; gfx803, global loads alternating between no offset and offset 2048 on gfx900,
; and global loads with no immediate offset at all on gfx1010.
446define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
447; GCN-LABEL: ReverseOrder:
448; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
449; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
450; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
451; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
452; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
453; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
454; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
455; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
456;
457; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
458; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
459; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
460; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
461; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
462; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
463; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
464; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
465;
466; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
467; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
468; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
469; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
470; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
471; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
472; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
473; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
474entry:
  ; Base %saddr = %buffer + ((id << 7) & 0xFFFF8000) bytes, viewed as i64*.
475  %call = tail call i64 @_Z13get_global_idj(i32 0)
476  %conv = and i64 %call, 255
477  %a0 = shl i64 %call, 7
478  %idx.ext11 = and i64 %a0, 4294934528
479  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
480  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
481
  ; %addr1 (base + (id & 255) elements) anchors every load below.
482  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
483  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
484
  ; From here the constant offsets descend: 1792, 1536, ..., 256 elements.
485  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
486  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
487  %add7 = add i64 %load8, %load1
488
489  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
490  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
491  %add6 = add i64 %load7, %add7
492
493  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
494  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
495  %add5 = add i64 %load6, %add6
496
497  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
498  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
499  %add4 = add i64 %load5, %add5
500
501  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
502  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
503  %add3 = add i64 %load4, %add4
504
505  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
506  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
507  %add2 = add i64 %load3, %add3
508
509  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
510  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
511  %add1 = add i64 %load2, %add2
512
  ; The sum of all eight loads is stored to the base %saddr.
513  store i64 %add1, i64 addrspace(1)* %saddr, align 8
514  ret void
515}
516
; Two i64 loads at large negative constant element offsets, -536870656 and
; -536870912, i.e. byte offsets -(2^32 - 2048) and -2^32 from %buffer_wave.
; Their sum is stored to %buffer_head.
; The check lines expect two loads per target: offset-less flat loads on
; gfx803, one global load with offset -2048 plus one without on gfx900, and two
; global loads without an immediate offset on gfx1010.
517define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
518; GCN-LABEL: negativeoffset:
519; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
520; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
521;
522; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
523; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
524;
525; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
526; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
527entry:
  ; NOTE(review): the call below references attribute group #2, which has no
  ; definition in the visible part of this file -- confirm it is intentional.
528  %call = tail call i64 @_Z13get_global_idj(i32 0) #2
529  %conv = and i64 %call, 255
530  %0 = shl i64 %call, 7
531  %idx.ext11 = and i64 %0, 4294934528
532  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
533  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
534
  ; %buffer_wave = %buffer_head + (id & 255) i64 elements; both loads are
  ; expressed relative to it.
535  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv
536
  ; -536870656 * 8 = -(2^32 - 2048) bytes.
537  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
538  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
539
  ; -536870912 * 8 = -2^32 bytes.
540  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
541  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
542
543
544  %add = add i64 %load2, %load1
545
546  store i64 %add, i64 addrspace(1)* %buffer_head, align 8
547  ret void
548}
549