; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s

; FIXME: The tahiti configuration below is disabled: its directive is spelled
; "XUN" so that it is not executed. The reason it is disabled is not recorded
; in this file.
; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s

; Overview: every kernel below stores one <4 x i32> kernel argument through an
; addrspace(3) pointer; only the alignment stated on the store differs. The
; expectations record how the 128-bit store is emitted for gfx900 and hawaii.
; In every kernel the hawaii expectations additionally contain
; "s_mov_b32 m0, -1" ahead of the DS writes; the gfx900 expectations do not.
; The former shared check prefix was dropped from the command lines because no
; assertion in this file uses it.

; Store with no explicit alignment on the store instruction.
; Both targets are expected to emit a single ds_write_b128.
define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
; GFX7-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out
  ret void
}

; Store marked "align 1".
; Both targets are expected to split it into sixteen ds_write_b8 operations
; (offsets 0 through 15), using s_lshr_b32 by 8/16/24 to extract the upper
; bytes of each 32-bit element.
define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s5, s0, 8
; GFX9-NEXT:    ds_write_b8 v1, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
; GFX9-NEXT:    s_lshr_b32 s7, s0, 24
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:1
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:2
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:3
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_lshr_b32 s0, s1, 8
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
; GFX9-NEXT:    s_lshr_b32 s5, s1, 24
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:5
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:7
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    s_lshr_b32 s0, s2, 8
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s1, s2, 16
; GFX9-NEXT:    s_lshr_b32 s4, s2, 24
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:9
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:10
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:11
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    s_lshr_b32 s0, s3, 8
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:12
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s1, s3, 16
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:13
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_lshr_b32 s2, s3, 24
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:14
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:15
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s5, s0, 8
; GFX7-NEXT:    ds_write_b8 v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s5
; GFX7-NEXT:    s_lshr_b32 s6, s0, 16
; GFX7-NEXT:    s_lshr_b32 s7, s0, 24
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:1
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:2
; GFX7-NEXT:    v_mov_b32_e32 v0, s7
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:3
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    s_lshr_b32 s0, s1, 8
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s4, s1, 16
; GFX7-NEXT:    s_lshr_b32 s5, s1, 24
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:5
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v0, s5
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:7
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    s_lshr_b32 s0, s2, 8
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s1, s2, 16
; GFX7-NEXT:    s_lshr_b32 s4, s2, 24
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:9
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:11
; GFX7-NEXT:    v_mov_b32_e32 v0, s3
; GFX7-NEXT:    s_lshr_b32 s0, s3, 8
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:12
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s1, s3, 16
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:13
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    s_lshr_b32 s2, s3, 24
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:14
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:15
; GFX7-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
  ret void
}

; Store marked "align 2".
; Both targets are expected to split it into eight ds_write_b16 operations
; (offsets 0 through 14), using s_lshr_b32 by 16 to extract the upper half of
; each 32-bit element.
define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s5, s0, 16
; GFX9-NEXT:    ds_write_b16 v1, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:2
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_lshr_b32 s0, s1, 16
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    s_lshr_b32 s0, s2, 16
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:10
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    s_lshr_b32 s0, s3, 16
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:12
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:14
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s5, s0, 16
; GFX7-NEXT:    ds_write_b16 v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s5
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:2
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    s_lshr_b32 s0, s2, 16
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v0, s3
; GFX7-NEXT:    s_lshr_b32 s0, s3, 16
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:12
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:14
; GFX7-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
  ret void
}

; Store marked "align 4".
; Both targets are expected to emit two ds_write2_b32 operations, the first
; with offset1:1 and the second with offset0:2 offset1:3. The two targets
; differ only in which VGPRs carry the upper two elements.
define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    ds_write2_b32 v1, v0, v2 offset1:1
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    ds_write2_b32 v1, v3, v0 offset0:2 offset1:3
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_write2_b32 v1, v0, v2 offset1:1
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s3
; GFX7-NEXT:    ds_write2_b32 v1, v0, v2 offset0:2 offset1:3
; GFX7-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
  ret void
}

; Store marked "align 8".
; The recorded expectations differ per target: gfx900 expects a single
; ds_write_b128, hawaii expects one ds_write2_b64 with offset1:1.
; NOTE(review): this is the only kernel in the file where the two targets pick
; a different DS instruction; confirm by regenerating the assertions that the
; gfx900 expectation still matches what llc emits for an 8-byte-aligned store.
define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX7-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8
  ret void
}

; Store marked "align 16".
; Both targets are expected to emit a single ds_write_b128.
define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
; GFX7-NEXT:    s_endpgm
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16
  ret void
}