; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s

; Code generation test for 128-bit (<4 x i32>) stores to addrspace(3) (the
; "lds" of the function names) at the default alignment and at explicit
; alignments of 1, 2, 4, 8 and 16 bytes. Each kernel is compiled for gfx900,
; hawaii and tahiti by the three llc invocations above, and the complete
; instruction sequence is pinned per target.
;
; The assertion lines are generated. Do not edit them by hand; change the IR
; and re-run utils/update_llc_test_checks.py instead.
;
; NOTE(review): none of the assertions visible in this file use the shared GCN
; prefix (every kernel differs between targets at least in the kernarg load
; offsets). If that holds for the whole file, the prefix can be dropped from
; the three check-prefixes lists.

; Store with no explicit alignment on the instruction.
; Expected: one ds_write_b128 on gfx900 and hawaii; one ds_write2_b64 on tahiti.
define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v4, s4
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    v_mov_b32_e32 v3, s1
; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
; GFX6-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out
  ret void
}

; Store with align 1.
; Expected: sixteen single-byte writes on every target. gfx900 writes the
; bytes at offsets 2, 6, 10 and 14 with ds_write_b8_d16_hi; hawaii and tahiti
; shift the source right by 16 and use a plain ds_write_b8 instead.
define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:12
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
; GFX9-NEXT:    ds_write_b8 v0, v2 offset:8
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:4
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    s_lshr_b32 s4, s3, 8
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    s_lshr_b32 s3, s3, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:13
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:15
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_lshr_b32 s2, s2, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:9
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    s_lshr_b32 s2, s1, 8
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    s_lshr_b32 s1, s1, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_lshr_b32 s0, s0, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:1
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:12
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    s_lshr_b32 s4, s3, 8
; GFX7-NEXT:    ds_write_b8 v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    s_lshr_b32 s4, s3, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:13
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:15
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s3, s2, 8
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s2, s1, 8
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s2, s1, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:1
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:2
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align1:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s2
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:12
; GFX6-NEXT:    ds_write_b8 v0, v2 offset:8
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:4
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    s_lshr_b32 s4, s3, 8
; GFX6-NEXT:    ds_write_b8 v0, v1
; GFX6-NEXT:    v_mov_b32_e32 v1, s4
; GFX6-NEXT:    s_lshr_b32 s4, s3, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:13
; GFX6-NEXT:    v_mov_b32_e32 v1, s4
; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:15
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s3, s2, 8
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:14
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s3, s2, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:9
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s2, s1, 8
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s2, s1, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s1, s0, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:1
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:2
; GFX6-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
  ret void
}

; Store with align 2.
; Expected: eight 16-bit writes on every target. gfx900 writes the upper half
; of each dword with ds_write_b16_d16_hi; hawaii and tahiti shift the source
; right by 16 and use a plain ds_write_b16 instead.
define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    ds_write_b16 v0, v1 offset:12
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
; GFX9-NEXT:    ds_write_b16 v0, v2 offset:8
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    ds_write_b16 v0, v1
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:2
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:12
; GFX7-NEXT:    ds_write_b16 v0, v2 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
; GFX7-NEXT:    ds_write_b16 v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:14
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:2
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align2:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s2
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:12
; GFX6-NEXT:    ds_write_b16 v0, v2 offset:8
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:4
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
; GFX6-NEXT:    ds_write_b16 v0, v1
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:14
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:10
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:2
; GFX6-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
  ret void
}

; Store with align 4.
; Expected: two ds_write2_b32 instructions on every target (dwords 0/1, then
; dwords 2/3).
define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s3
; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align4:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset1:1
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s2
; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset0:2 offset1:3
; GFX6-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
  ret void
}

; Store with align 8.
; Expected: one ds_write_b128 on gfx900; one ds_write2_b64 on hawaii and
; tahiti.
; NOTE(review): gfx900 keeps the single 128-bit write here while hawaii splits
; it into two 64-bit halves, even though both use ds_write_b128 for the
; 16-byte-aligned store - confirm the gfx900 expectation is intended for
; 8-byte alignment.
define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v4, s4
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    v_mov_b32_e32 v3, s1
; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
; GFX6-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8
  ret void
}

; Store with align 16.
; Expected: one ds_write_b128 on gfx900 and hawaii; one ds_write2_b64 on
; tahiti. These are the same sequences as the store without an explicit
; alignment.
define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v4, s4
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    v_mov_b32_e32 v3, s1
; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
; GFX6-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16
  ret void
}