; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s

; Tests for merging pairs of stores to addrspace(3) memory into a single
; ds_write2_b32 / ds_write2_b64, and for the cases where the two stores must
; stay separate ds_write_b32 instructions.

@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8


; One loaded float is stored to @lds at element index tid and tid + 8; a
; single ds_write2_b32 with offset1:8 is expected.
; SI-LABEL: @simple_write2_one_val_f32
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
; SI: s_endpgm
define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %val = load float, float addrspace(1)* %in.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Two floats loaded from adjacent elements of %in are stored to @lds at
; element index tid and tid + 8; a single ds_write2_b32 is expected.
; SI-LABEL: @simple_write2_two_val_f32
; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %val0 = load float, float addrspace(1)* %in.gep.0, align 4
  %val1 = load float, float addrspace(1)* %in.gep.1, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The first of the two stores is volatile; no ds_write2_b32 may be formed and
; two separate ds_write_b32 instructions are expected.
; SI-LABEL: @simple_write2_two_val_f32_volatile_0
; SI-NOT: ds_write2_b32
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
; SI: s_endpgm
define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Same as the previous test, but the second store is the volatile one.
; SI-LABEL: @simple_write2_two_val_f32_volatile_1
; SI-NOT: ds_write2_b32
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
; SI: s_endpgm
define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; 2 data subregisters from different super registers.
; Element 0 of the first <2 x float> load and element 1 of the second load are
; the two stored values.
; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32
; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
  %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
  %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
  %val0.0 = extractelement <2 x float> %val0, i32 0
  %val1.1 = extractelement <2 x float> %val1, i32 1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Both stored values are the two elements of one <2 x float> load.
; SI-LABEL: @simple_write2_two_val_subreg2_f32
; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
  %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
  %val0 = extractelement <2 x float> %val, i32 0
  %val1 = extractelement <2 x float> %val, i32 1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The stored values are elements 0 and 3 of one <4 x float> load.
; SI-LABEL: @simple_write2_two_val_subreg4_f32
; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
  %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
  %val0 = extractelement <4 x float> %val, i32 0
  %val1 = extractelement <4 x float> %val, i32 3
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The two stores are 255 elements apart; a ds_write2_b32 with offset1:255 is
; still expected.
; SI-LABEL: @simple_write2_two_val_max_offset_f32
; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
; SI: s_endpgm
define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %val0 = load float, float addrspace(1)* %in.gep.0, align 4
  %val1 = load float, float addrspace(1)* %in.gep.1, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 255
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The two stores are 257 elements (1028 bytes) apart; two separate
; ds_write_b32 instructions are expected, the second with offset:1028.
; SI-LABEL: @simple_write2_two_val_too_far_f32
; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
; SI: s_endpgm
define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 257
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Four stores at element offsets 0, 8, 11 and 27 from tid; two consecutive
; ds_write2_b32 instructions sharing one base address are expected.
; SI-LABEL: @simple_write2_two_val_f32_x2
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  store float %val0, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  store float %val1, float addrspace(3)* %arrayidx1, align 4

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  store float %val0, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  store float %val1, float addrspace(3)* %arrayidx3, align 4

  ret void
}

; Same as the previous test, but the first store is at element offset 3, so
; the first ds_write2_b32 carries a non-zero offset0.
; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %idx.0 = add nsw i32 %tid.x, 3
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  store float %val0, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  store float %val1, float addrspace(3)* %arrayidx1, align 4

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  store float %val0, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  store float %val1, float addrspace(3)* %arrayidx3, align 4

  ret void
}

; The two destination pointers are the two lanes of the <2 x pointer>
; argument %lds.ptr, so the stores have different bases; no ds_write2_b32 may
; be formed and two ds_write_b32 instructions are expected.
; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32
; SI-NOT: ds_write2_b32
; SI: ds_write_b32
; SI: ds_write_b32
; SI: s_endpgm
define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  ; Lane 0 indexes by the work-item id, lane 1 by the constant 8. The constant
  ; must go into lane 1: inserting it into lane 0 again would discard %x.i and
  ; leave lane 1 (consumed by %gep.1 below) undef.
  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 1
  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1

  ; Apply an additional offset after the vector that will be more obviously folded.
  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
  store float %val0, float addrspace(3)* %gep.0, align 4

  store float %val1, float addrspace(3)* %gep.1.offset, align 4
  ret void
}

; One loaded double is stored to @lds.f64 at element index tid and tid + 8; a
; single ds_write2_b64 is expected.
; SI-LABEL: @simple_write2_one_val_f64
; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
; SI: s_endpgm
define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %val = load double, double addrspace(1)* %in.gep, align 8
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  store double %val, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  store double %val, double addrspace(3)* %arrayidx1, align 8
  ret void
}

; The double stores are only 4-byte aligned; each is expected to be emitted as
; a ds_write2_b32 of its two halves (dword offsets 0/1 and 14/15, the second
; store being 7 doubles past the first).
; SI-LABEL: @misaligned_simple_write2_one_val_f64
; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
; SI: s_endpgm
define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %val = load double, double addrspace(1)* %in.gep, align 8
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  store double %val, double addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 7
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
  store double %val, double addrspace(3)* %arrayidx1, align 4
  ret void
}

; Two doubles loaded from adjacent elements of %in are stored to @lds.f64 at
; element index tid and tid + 8; a single ds_write2_b64 is expected.
; SI-LABEL: @simple_write2_two_val_f64
; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
  %val0 = load double, double addrspace(1)* %in.gep.0, align 8
  %val1 = load double, double addrspace(1)* %in.gep.1, align 8
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  store double %val0, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  store double %val1, double addrspace(3)* %arrayidx1, align 8
  ret void
}

@foo = addrspace(3) global [4 x i32] undef, align 4

; Two i32 stores of the constant 123 to elements 0 and 1 of @foo; expected to
; become one ds_write2_b32 off a zero base register.
; SI-LABEL: @store_constant_adjacent_offsets
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
define void @store_constant_adjacent_offsets() {
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
  ret void
}

; Same constant stored to elements 0 and 2 of @foo; one ds_write2_b32 with
; offset1:2 that reuses a single value register (0x7b) is expected.
; SI-LABEL: @store_constant_disjoint_offsets
; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
define void @store_constant_disjoint_offsets() {
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
  ret void
}

@bar = addrspace(3) global [4 x i64] undef, align 4

; Two 4-byte-aligned i64 stores to elements 0 and 1 of @bar; each is expected
; as a ds_write2_b32 (dword offsets 0/1 and 2/3) off the same zero base.
; SI-LABEL: @store_misaligned64_constant_offsets
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
define void @store_misaligned64_constant_offsets() {
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
  ret void
}

@bar.large = addrspace(3) global [4096 x i64] undef, align 4

; 4-byte-aligned i64 stores to elements 2048 and 4095 of @bar.large (byte
; offsets 0x4000 and 0x7ff8); each store is expected as its own ds_write2_b32
; with the byte offset materialized in a separate base register.
; SI-LABEL: @store_misaligned64_constant_large_offsets
; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; SI: s_endpgm
define void @store_misaligned64_constant_large_offsets() {
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
  ret void
}

@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4

; Stores one loaded float to four elements of @sgemm.lA (indices +0, +1, +16,
; +17 from the work-group id x) and six elements of @sgemm.lB (indices +0, +1,
; +32, +33, +64, +65 from the work-item id y).
; No FileCheck directives are attached to this function; it is only compiled
; (with -verify-machineinstrs enabled on the llc command line above).
define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
  %wg.x = tail call i32 @llvm.r600.read.tgid.x() #1
  %wi.y = tail call i32 @llvm.r600.read.tidig.y() #1
  %v = load float, float addrspace(1)* %in

  ; Stores into @sgemm.lA.
  %lA.p0 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %wg.x
  store float %v, float addrspace(3)* %lA.p0, align 4
  %lA.i1 = add nsw i32 %wg.x, 1
  %lA.p1 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %lA.i1
  store float %v, float addrspace(3)* %lA.p1, align 4
  %lA.i16 = add nsw i32 %wg.x, 16
  %lA.p16 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %lA.i16
  store float %v, float addrspace(3)* %lA.p16, align 4
  %lA.i17 = add nsw i32 %wg.x, 17
  %lA.p17 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %lA.i17
  store float %v, float addrspace(3)* %lA.p17, align 4

  ; Stores into @sgemm.lB.
  %lB.p0 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %wi.y
  store float %v, float addrspace(3)* %lB.p0, align 4
  %lB.i1 = add nsw i32 %wi.y, 1
  %lB.p1 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %lB.i1
  store float %v, float addrspace(3)* %lB.p1, align 4
  %lB.i32 = add nsw i32 %wi.y, 32
  %lB.p32 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %lB.i32
  store float %v, float addrspace(3)* %lB.p32, align 4
  %lB.i33 = add nsw i32 %wi.y, 33
  %lB.p33 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %lB.i33
  store float %v, float addrspace(3)* %lB.p33, align 4
  %lB.i64 = add nsw i32 %wi.y, 64
  %lB.p64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %lB.i64
  store float %v, float addrspace(3)* %lB.p64, align 4
  %lB.i65 = add nsw i32 %wi.y, 65
  %lB.p65 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %lB.i65
  store float %v, float addrspace(3)* %lB.p65, align 4
  ret void
}

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tgid.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tgid.y() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.y() #1

; Function Attrs: noduplicate nounwind
declare void @llvm.AMDGPU.barrier.local() #2

attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { noduplicate nounwind }