; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; Each kernel below issues several global loads whose addresses differ from a
; shared base pointer only by compile-time constants, then sums the loaded
; values and stores the result.  The check lines pin, per target, which loads
; are expected to carry an immediate in the instruction's offset field:
;   * gfx803  - only the flat_load opcode and register operands are matched.
;   * gfx900  - global_load with immediates in the range seen below
;               (-4096 .. 3072), or no immediate at all (line ends after "off").
;   * gfx1010 - global_load with immediates in the range seen below
;               (-2048 .. 1024), or no immediate at all.
; NOTE(review): presumably this exercises promotion of constant address
; offsets into the load's immediate field; confirm against the pass under test.
;
; Comments in this file intentionally never spell a check prefix followed by a
; colon, so that FileCheck does not mistake prose for a directive.

declare i64 @_Z13get_global_idj(i32)

; Straight-line case: eight i64 loads at element offsets 0, 256, ..., 1792
; (a 2048-byte stride) from %addr1; the running sum is stored to %saddr.
define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read_simplified:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add.1 = add i64 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
  ret void
}

; Loop case: every for.body iteration issues eleven i64 loads at element
; offsets (%block.029 | K) for K = 0, 256, ..., 2560.  %block.029 advances by
; 8192 per iteration until it reaches 4194304; the outer loop counter %dec31
; starts at 127 and the loop exits once it is observed to be 0.
define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 17
  %idx.ext11 = and i64 %a0, 4261412864
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
  br label %for.cond.preheader

while.cond.loopexit:                              ; preds = %for.body
  %dec = add nsw i32 %dec31, -1
  %tobool = icmp eq i32 %dec31, 0
  br i1 %tobool, label %while.end, label %for.cond.preheader

for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
  br label %for.body

for.body:                                         ; preds = %for.body, %for.cond.preheader
  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
  %conv3 = zext i32 %block.029 to i64
  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
  %add = add i64 %load1, %sum.128

  %add9 = or i32 %block.029, 256
  %conv3.1 = zext i32 %add9 to i64
  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
  %add.1 = add i64 %load2, %add

  %add9.1 = or i32 %block.029, 512
  %conv3.2 = zext i32 %add9.1 to i64
  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %l3, %add.1

  %add9.2 = or i32 %block.029, 768
  %conv3.3 = zext i32 %add9.2 to i64
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %l4, %add.2

  %add9.3 = or i32 %block.029, 1024
  %conv3.4 = zext i32 %add9.3 to i64
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %l5, %add.3

  %add9.4 = or i32 %block.029, 1280
  %conv3.5 = zext i32 %add9.4 to i64
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %l6, %add.4

  %add9.5 = or i32 %block.029, 1536
  %conv3.6 = zext i32 %add9.5 to i64
  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5

  %add9.6 = or i32 %block.029, 1792
  %conv3.7 = zext i32 %add9.6 to i64
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  %add9.7 = or i32 %block.029, 2048
  %conv3.8 = zext i32 %add9.7 to i64
  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
  %add.8 = add i64 %load9, %add.7

  %add9.8 = or i32 %block.029, 2304
  %conv3.9 = zext i32 %add9.8 to i64
  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
  %add.9 = add i64 %load10, %add.8

  %add9.9 = or i32 %block.029, 2560
  %conv3.10 = zext i32 %add9.9 to i64
  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
  %add.10 = add i64 %load11, %add.9

  %add9.31 = add nuw nsw i32 %block.029, 8192
  %cmp.31 = icmp ult i32 %add9.31, 4194304
  br i1 %cmp.31, label %for.body, label %while.cond.loopexit

while.end:                                        ; preds = %while.cond.loopexit
  store i64 %add.10, i64 addrspace(1)* %a1, align 8
  ret void
}

; using 32bit address.
; Ten i32 loads at element offsets 0, 256, ..., 2304 (a 1024-byte stride)
; from %add.ptr6; the running sum is stored to %addr.
define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Address32:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
;
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %id = shl i64 %call, 7
  %idx.ext11 = and i64 %id, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4

  %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
  %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
  %add.1 = add i32 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
  %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
  %add.2 = add i32 %load3, %add.1

  %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
  %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
  %add.3 = add i32 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
  %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
  %add.4 = add i32 %load5, %add.3

  %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
  %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
  %add.5 = add i32 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
  %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
  %add.6 = add i32 %load7, %add.5

  %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
  %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
  %add.7 = add i32 %load8, %add.6

  %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
  %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
  %add.8 = add i32 %load9, %add.7

  %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
  %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
  %add.9 = add i32 %load10, %add.8

  store i32 %add.9, i32 addrspace(1)* %addr, align 4
  ret void
}

; Large positive constants: i64 loads at element offsets 0, 536870400,
; 536870656 and 536870912 from %addr1, i.e. byte offsets 0, 2^32 - 4096,
; 2^32 - 2048 and 2^32.
define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Offset64:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add1 = add i64 %load2, %load1

  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
  %load3 = load i64, i64 addrspace(1)* %addr3, align 8

  %add2 = add i64 %load3, %add1

  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
  %add4 = add i64 %load4, %add2

  store i64 %add4, i64 addrspace(1)* %saddr, align 8
  ret void
}

; TODO: Support load4 as anchor instruction.
; Same element offsets as Offset64 but with i32 elements, giving byte offsets
; 0, 2^31 - 2048, 2^31 - 1024 and 2^31 from %addr1.
define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: p32Offset64:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
  %load2 = load i32, i32 addrspace(1)* %addr2, align 8

  %add1 = add i32 %load2, %load1

  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
  %load3 = load i32, i32 addrspace(1)* %addr3, align 8

  %add2 = add i32 %load3, %add1

  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
  %load4 = load i32, i32 addrspace(1)* %addr4, align 8
  %add4 = add i32 %load4, %add2

  store i32 %add4, i32 addrspace(1)* %saddr, align 8
  ret void
}

; Two distinct bases: three i64 loads at element offsets 512, 768, 1024 from
; the %buffer1-derived pointer %saddr, and three at 1280, 1536, 1792 from the
; %buffer2-derived pointer %saddr2.  Both partial sums are added and stored
; to %saddr.  The check lines sit between the two parameter lines of the
; signature; they are comments and do not affect parsing.
define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
; GCN-LABEL: DiffBase:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
                                    i8 addrspace(1)* %buffer2) {
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add1 = add i64 %load2, %load1
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add2 = add i64 %load3, %add1

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add3 = add i64 %load5, %load4

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add4 = add i64 %load6, %add3

  %add5 = add i64 %add2, %add4

  store i64 %add5, i64 addrspace(1)* %saddr, align 8
  ret void
}

; Same eight element offsets as clmem_read_simplified (0, 256, ..., 1792 from
; %addr1), but after the first load the remaining loads appear in the IR in
; descending offset order (1792 down to 256).
define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; GCN-LABEL: ReverseOrder:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add7 = add i64 %load8, %load1

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add6 = add i64 %load7, %add7

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add5 = add i64 %load6, %add6

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add4 = add i64 %load5, %add5

  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add3 = add i64 %load4, %add4

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add2 = add i64 %load3, %add3

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add1 = add i64 %load2, %add2

  store i64 %add1, i64 addrspace(1)* %saddr, align 8
  ret void
}

; Large negative constants: two i64 loads at element offsets -536870656 and
; -536870912 from %buffer_wave, i.e. byte offsets -(2^32 - 2048) and -2^32.
; NOTE(review): the call below references attribute group #2, which is not
; defined in the portion of the file visible here; confirm it is either
; defined later or intentionally left dangling.
define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
; GCN-LABEL: negativeoffset:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0) #2
  %conv = and i64 %call, 255
  %0 = shl i64 %call, 7
  %idx.ext11 = and i64 %0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8


  %add = add i64 %load2, %load1

  store i64 %add, i64 addrspace(1)* %buffer_head, align 8
  ret void
}
