; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s

; FUNC-LABEL: {{^}}store_i1:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
define amdgpu_kernel void @store_i1(i1 addrspace(5)* %out) {
entry:
  store i1 true, i1 addrspace(5)* %out
  ret void
}

; i8 store
; FUNC-LABEL: {{^}}store_i8:
; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x
; EG-NEXT: 2
; EG: MOVA_INT * AR.x (MASKED)
; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x

; IG 0: Get the byte index and truncate the value
; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
; EG-NEXT: 3(4.203895e-45)

; EG: LSHL * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], literal.x, PV.W
; EG-NEXT: 255(3.573311e-43)

; EG: NOT_INT
; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]]
; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]]
; TODO: Is the reload necessary?
; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]]
; EG: MOV * T(0 + AR.x).X+, [[RES]]

; SI: buffer_store_byte

define amdgpu_kernel void @store_i8(i8 addrspace(5)* %out, i8 %in) {
entry:
  store i8 %in, i8 addrspace(5)* %out
  ret void
}

; i16 store
; FUNC-LABEL: {{^}}store_i16:
; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x
; EG-NEXT: 2
; EG: MOVA_INT * AR.x (MASKED)
; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x

; EG: VTX_READ_16

; IG 0: Get the byte index and truncate the value
; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
; EG-NEXT: 3(4.203895e-45)

; EG: NOT_INT
; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]]
; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]]
; TODO: Is the reload necessary?
; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]]
; EG: MOV * T(0 + AR.x).X+, [[RES]]

; SI: buffer_store_short
define amdgpu_kernel void @store_i16(i16 addrspace(5)* %out, i16 %in) {
entry:
  store i16 %in, i16 addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_i24:
; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_store_short

; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store can be eliminated
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store can be eliminated
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
define amdgpu_kernel void @store_i24(i24 addrspace(5)* %out, i24 %in) {
entry:
  store i24 %in, i24 addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_i25:
; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
; SI: buffer_store_dword [[VAND]]

; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG-NOT: MOVA_INT

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT
define amdgpu_kernel void @store_i25(i25 addrspace(5)* %out, i25 %in) {
entry:
  store i25 %in, i25 addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v2i8:
; v2i8 is naturally 2B aligned, treat as i16
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG-NOT: MOVA_INT

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT

; SI: buffer_store_short
define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(5)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v2i8_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(5)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(5)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}store_v2i16:
; v2i16 is naturally 4B aligned, treat as i32
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG-NOT: MOVA_INT

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT

; SI: buffer_store_dword
define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(5)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v2i16_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_short
; SI: buffer_store_short
define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(5)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(5)* %out, align 2
  ret void
}

; FUNC-LABEL: {{^}}store_v4i8:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG-NOT: MOVA_INT

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT

; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(5)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v4i8_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(5)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(5)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}store_v8i8_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v8i8_unaligned(<8 x i8> addrspace(5)* %out, <8 x i32> %in) {
entry:
  %0 = trunc <8 x i32> %in to <8 x i8>
  store <8 x i8> %0, <8 x i8> addrspace(5)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}store_v4i8_halfaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
; they might be different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_short
; SI: buffer_store_short
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(5)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(5)* %out, align 2
  ret void
}

; floating-point store
; FUNC-LABEL: {{^}}store_f32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_dword

define amdgpu_kernel void @store_f32(float addrspace(5)* %out, float %in) {
  store float %in, float addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v4i16:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x2?
; XSI: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(5)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i16>
  store <4 x i16> %0, <4 x i16> addrspace(5)* %out
  ret void
}

; vec2 floating-point stores
; FUNC-LABEL: {{^}}store_v2f32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x2?
; XSI: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword

define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(5)* %out, float %a, float %b) {
entry:
  %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
  %1 = insertelement <2 x float> %0, float %b, i32 1
  store <2 x float> %1, <2 x float> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v3i32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x2?
; XSI-DAG: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword

define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(5)* %out, <3 x i32> %a) nounwind {
  store <3 x i32> %a, <3 x i32> addrspace(5)* %out, align 16
  ret void
}

; FUNC-LABEL: {{^}}store_v4i32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x4?
; XSI: buffer_store_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(5)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v4i32_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x4?
; XSI: buffer_store_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(5)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(5)* %out, align 4
  ret void
}

; v4f32 store
; FUNC-LABEL: {{^}}store_v4f32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x4?
; XSI: buffer_store_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(5)* %out, <4 x float> addrspace(5)* %in) {
  %1 = load <4 x float>, <4 x float> addrspace(5)* %in
  store <4 x float> %1, <4 x float> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_i64_i8:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
define amdgpu_kernel void @store_i64_i8(i8 addrspace(5)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i8
  store i8 %0, i8 addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_i64_i16:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_short
define amdgpu_kernel void @store_i64_i16(i16 addrspace(5)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i16
  store i16 %0, i16 addrspace(5)* %out
  ret void
}

; The stores in this function are combined by the optimizer to create a
; 64-bit store with 32-bit alignment. This is legal, and the legalizer
; should not try to split the 64-bit store back into two 32-bit stores.

; FUNC-LABEL: {{^}}vecload2:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x2?
; XSI: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @vecload2(i32 addrspace(5)* nocapture %out, i32 addrspace(4)* nocapture %mem) #0 {
entry:
  %0 = load i32, i32 addrspace(4)* %mem, align 4
  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(4)* %mem, i64 1
  %1 = load i32, i32 addrspace(4)* %arrayidx1.i, align 4
  store i32 %0, i32 addrspace(5)* %out, align 4
  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 1
  store i32 %1, i32 addrspace(5)* %arrayidx1, align 4
  ret void
}

; When i128 was a legal type, this program generated "cannot select" errors:

; FUNC-LABEL: {{^}}"i128-const-store":
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x4?
; XSI: buffer_store_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @i128-const-store(i32 addrspace(5)* %out) {
entry:
  store i32 1, i32 addrspace(5)* %out, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 1
  store i32 1, i32 addrspace(5)* %arrayidx2, align 4
  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 2
  store i32 2, i32 addrspace(5)* %arrayidx4, align 4
  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 3
  store i32 2, i32 addrspace(5)* %arrayidx6, align 4
  ret void
}

attributes #0 = { nounwind }