; Checks the store instructions selected for stores to addrspace(1) pointers
; across a range of scalar and vector types and alignments.
;
; Check-prefix usage, as set up by the llc invocations below
;   SI    -> -mcpu=verde
;   VI    -> -mcpu=tonga
;   SIVI  -> both verde and tonga
;   GFX9  -> -mcpu=gfx900
;   GCN   -> all three amdgcn invocations
;   EG    -> -mcpu=redwood
;   CM    -> -mcpu=cayman
;   FUNC  -> every invocation

; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SIVI,SI,FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SIVI,VI,FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefixes=CM,FUNC %s

; i1 store
; FUNC-LABEL: {{^}}store_i1:
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR

; CM: MEM_RAT MSKOR
; CM-NOT: MEM_RAT MSKOR

; SIVI: buffer_store_byte
; GFX9: global_store_byte
define amdgpu_kernel void @store_i1(i1 addrspace(1)* %out) {
entry:
  store i1 true, i1 addrspace(1)* %out
  ret void
}

; i8 store
; FUNC-LABEL: {{^}}store_i8:
; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
; EG-NOT: MEM_RAT MSKOR

; EG: VTX_READ_8
; EG: AND_INT
; EG: AND_INT
; EG: LSHL
; EG: LSHL
; EG: LSHL

; SIVI: buffer_store_byte
; GFX9: global_store_byte
define amdgpu_kernel void @store_i8(i8 addrspace(1)* %out, i8 %in) {
entry:
  store i8 %in, i8 addrspace(1)* %out
  ret void
}

; i16 store
; FUNC-LABEL: {{^}}store_i16:
; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
; EG-NOT: MEM_RAT MSKOR

; EG: VTX_READ_16
; EG: AND_INT
; EG: AND_INT
; EG: LSHL
; EG: LSHL
; EG: LSHL


; SIVI: buffer_store_short
; GFX9: global_store_short
define amdgpu_kernel void @store_i16(i16 addrspace(1)* %out, i16 %in) {
entry:
  store i16 %in, i16 addrspace(1)* %out
  ret void
}

; i24 store
; FUNC-LABEL: {{^}}store_i24:
; SIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; SIVI-DAG: buffer_store_byte
; SIVI-DAG: buffer_store_short

; GFX9-DAG: global_store_byte_d16_hi v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:2
; GFX9-DAG: global_store_short

; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
define amdgpu_kernel void @store_i24(i24 addrspace(1)* %out, i24 %in) {
entry:
  store i24 %in, i24 addrspace(1)* %out
  ret void
}

; i25 store
; FUNC-LABEL: {{^}}store_i25:
; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
; GCN: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
; SIVI: buffer_store_dword [[VAND]]
; GFX9: global_store_dword v{{[0-9]+}}, [[VAND]], s

; EG: MEM_RAT_CACHELESS STORE_RAW
; EG-NOT: MEM_RAT

; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT
define amdgpu_kernel void @store_i25(i25 addrspace(1)* %out, i25 %in) {
entry:
  store i25 %in, i25 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v2i8:
; v2i8 is naturally 2B aligned
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR

; CM: MEM_RAT MSKOR
; CM-NOT: MEM_RAT MSKOR

; SIVI: buffer_store_short
; GFX9: global_store_short
define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(1)* %out
  ret void
}

; v2i8 store with an explicit alignment of 1
; FUNC-LABEL: {{^}}store_v2i8_unaligned:
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR

; CM: MEM_RAT MSKOR
; CM: MEM_RAT MSKOR
; CM-NOT: MEM_RAT MSKOR

; SI: buffer_store_byte
define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(1)* %out, align 1
  ret void
}


; v2i16 store
; FUNC-LABEL: {{^}}store_v2i16:
; EG: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dword
; GFX9: global_store_dword
define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(1)* %out
  ret void
}

; v2i16 store with an explicit alignment of 2
; FUNC-LABEL: {{^}}store_v2i16_unaligned:
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT MSKOR
; CM: MEM_RAT MSKOR
; CM-NOT: MEM_RAT MSKOR
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_short
; SIVI: buffer_store_short

; GFX9: global_store_short
; GFX9: global_store_short
define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(1)* %out, align 2
  ret void
}

; v4i8 store
; FUNC-LABEL: {{^}}store_v4i8:
; EG: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dword
; GFX9: global_store_dword
define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(1)* %out
  ret void
}

; v4i8 store with an explicit alignment of 1
; FUNC-LABEL: {{^}}store_v4i8_unaligned:
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT MSKOR
; CM: MEM_RAT MSKOR
; CM: MEM_RAT MSKOR
; CM: MEM_RAT MSKOR
; CM-NOT: MEM_RAT MSKOR
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 1
  ret void
}

; v4i8 store with an explicit alignment of 2
; FUNC-LABEL: {{^}}store_v4i8_halfaligned:
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT MSKOR
; CM: MEM_RAT MSKOR
; CM-NOT: MEM_RAT MSKOR
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SI: buffer_store_short
; SI: buffer_store_short
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 2
  ret void
}

; floating-point store
; FUNC-LABEL: {{^}}store_f32:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1

; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}

; SIVI: buffer_store_dword
; GFX9: global_store_dword

define amdgpu_kernel void @store_f32(float addrspace(1)* %out, float %in) {
  store float %in, float addrspace(1)* %out
  ret void
}

; v4i16 store
; FUNC-LABEL: {{^}}store_v4i16:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY

; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}

; SIVI: buffer_store_dwordx2
; GFX9: global_store_dwordx2
define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i16>
  store <4 x i16> %0, <4 x i16> addrspace(1)* %out
  ret void
}

; vec2 floating-point stores
; FUNC-LABEL: {{^}}store_v2f32:
; EG: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dwordx2
; GFX9: global_store_dwordx2

define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
entry:
  %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
  %1 = insertelement <2 x float> %0, float %b, i32 1
  store <2 x float> %1, <2 x float> addrspace(1)* %out
  ret void
}

; v3i32 store with an explicit alignment of 16
; FUNC-LABEL: {{^}}store_v3i32:
; SI-DAG: buffer_store_dword v
; SI-DAG: buffer_store_dwordx2

; VI: buffer_store_dwordx3

; GFX9: global_store_dwordx3

; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XY}}, {{T[0-9]+\.[XYZW]}},
define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
  store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16
  ret void
}

; v4i32 store
; FUNC-LABEL: {{^}}store_v4i32:
; EG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XYZW}}
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dwordx4
; GFX9: global_store_dwordx4
define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out
  ret void
}

; v4i32 store with an explicit alignment of 4
; FUNC-LABEL: {{^}}store_v4i32_unaligned:
; EG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XYZW}}
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dwordx4
; GFX9: global_store_dwordx4
define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; v4f32 store
; FUNC-LABEL: {{^}}store_v4f32:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dwordx4
; GFX9: global_store_dwordx4
define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
  %1 = load <4 x float>, <4 x float> addrspace(1) * %in
  store <4 x float> %1, <4 x float> addrspace(1)* %out
  ret void
}

; i8 store of a value truncated from i64
; FUNC-LABEL: {{^}}store_i64_i8:
; EG: MEM_RAT MSKOR

; CM: MEM_RAT MSKOR

; SIVI: buffer_store_byte
; GFX9: global_store_byte
define amdgpu_kernel void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i8
  store i8 %0, i8 addrspace(1)* %out
  ret void
}

; i16 store of a value truncated from i64
; FUNC-LABEL: {{^}}store_i64_i16:
; EG: MEM_RAT MSKOR
; SIVI: buffer_store_short
; GFX9: global_store_short
define amdgpu_kernel void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i16
  store i16 %0, i16 addrspace(1)* %out
  ret void
}

; The stores in this function are combined by the optimizer to create a
; 64-bit store with 32-bit alignment.  This is legal and the legalizer
; should not try to split the 64-bit store back into 2 32-bit stores.

; FUNC-LABEL: {{^}}vecload2:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XY, T[0-9]+\.X}}, 1
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dwordx2
; GFX9: global_store_dwordx2
define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* nocapture %mem) #0 {
entry:
  %0 = load i32, i32 addrspace(4)* %mem, align 4
  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(4)* %mem, i64 1
  %1 = load i32, i32 addrspace(4)* %arrayidx1.i, align 4
  store i32 %0, i32 addrspace(1)* %out, align 4
  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
  ret void
}

; When i128 was a legal type, this program generated "cannot select" errors.

; FUNC-LABEL: {{^}}"i128-const-store":
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 1

; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}, T{{[0-9]+}}.X

; SIVI: buffer_store_dwordx4
; GFX9: global_store_dwordx4
define amdgpu_kernel void @i128-const-store(i32 addrspace(1)* %out) {
entry:
  store i32 1, i32 addrspace(1)* %out, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 1, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 2, i32 addrspace(1)* %arrayidx4, align 4
  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 2, i32 addrspace(1)* %arrayidx6, align 4
  ret void
}

attributes #0 = { nounwind }