; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s

; Each function below loads a <3 x i32> through an addrspace(3) pointer with a
; different alignment and pins the instruction sequence llc emits for three
; subtargets (gfx900, hawaii, tahiti).
;
; NOTE(review): the GCN prefix is passed on every llc invocation above, but no
; check line visible in this file uses it - confirm whether it is still needed.

; No explicit alignment on the load. The gfx900 and hawaii checks expect one
; ds_read_b96; the tahiti checks expect a ds_read_b32 plus a ds_read_b64.
define <3 x i32> @load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b96 v[0:2], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_b96 v[0:2], v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v3i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_b32 v2, v1
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr
  ret <3 x i32> %load
}

; align 1: all three subtargets are expected to read the 12 bytes one at a
; time with ds_read_u8 and rebuild the three dwords with shifts and ORs.
define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_u8 v1, v0
; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX9-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_u8 v1, v0 offset:7
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:6
; GFX7-NEXT:    ds_read_u8 v4, v0 offset:5
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:3
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:2
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:1
; GFX7-NEXT:    ds_read_u8 v8, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:11
; GFX7-NEXT:    ds_read_u8 v4, v0 offset:10
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:9
; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v3i32_align1:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 6, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 9, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 11, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_u8 v2, v2
; GFX6-NEXT:    ds_read_u8 v3, v3
; GFX6-NEXT:    ds_read_u8 v4, v4
; GFX6-NEXT:    ds_read_u8 v5, v5
; GFX6-NEXT:    ds_read_u8 v6, v6
; GFX6-NEXT:    ds_read_u8 v7, v7
; GFX6-NEXT:    ds_read_u8 v1, v1
; GFX6-NEXT:    ds_read_u8 v8, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 3, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
; GFX6-NEXT:    ds_read_u8 v4, v4
; GFX6-NEXT:    ds_read_u8 v5, v5
; GFX6-NEXT:    ds_read_u8 v6, v6
; GFX6-NEXT:    ds_read_u8 v0, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
; GFX6-NEXT:    v_or_b32_e32 v0, v3, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
  ret <3 x i32> %load
}

; align 2: all three subtargets are expected to use six ds_read_u16 loads and
; combine each pair of halves into one dword.
define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_u16 v1, v0
; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_u16 v2, v0 offset:10
; GFX7-NEXT:    ds_read_u16 v3, v0 offset:8
; GFX7-NEXT:    ds_read_u16 v1, v0 offset:6
; GFX7-NEXT:    ds_read_u16 v4, v0 offset:4
; GFX7-NEXT:    ds_read_u16 v5, v0 offset:2
; GFX7-NEXT:    ds_read_u16 v0, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v3i32_align2:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_u16 v2, v2
; GFX6-NEXT:    ds_read_u16 v3, v3
; GFX6-NEXT:    ds_read_u16 v4, v4
; GFX6-NEXT:    ds_read_u16 v5, v5
; GFX6-NEXT:    ds_read_u16 v1, v1
; GFX6-NEXT:    ds_read_u16 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_or_b32_e32 v0, v3, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
  ret <3 x i32> %load
}

; align 4: the gfx900 and hawaii checks expect ds_read2_b32 plus ds_read_b32;
; the tahiti checks expect three separate ds_read_b32 loads.
define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v3i32_align4:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_b32 v2, v2
; GFX6-NEXT:    ds_read_b32 v1, v1
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
  ret <3 x i32> %load
}

; align 8: the expected sequences are the same as for the align 4 case.
define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v3i32_align8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_b32 v2, v2
; GFX6-NEXT:    ds_read_b32 v1, v1
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8
  ret <3 x i32> %load
}

; align 16: the expected sequences are the same as for the load with no
; explicit alignment (ds_read_b96, or ds_read_b32 + ds_read_b64 on tahiti).
define <3 x i32> @load_lds_v3i32_align16(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b96 v[0:2], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_b96 v[0:2], v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v3i32_align16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_b32 v2, v1
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
  ret <3 x i32> %load
}