; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s

; Test splitting flat instruction offsets into the low and high bits
; when the offset doesn't fit in the offset field.

; VALU cases: the pointer is an ordinary function argument, and the expected
; code addresses it through the VGPR pair v[0:1]. In the expectations below,
; the gfx900 output uses immediate offsets between -4096 and 4095 and the
; gfx1010 output uses immediate offsets between -2048 and 2047; larger
; constants are materialized with an explicit 64-bit add first.

define i8 @global_inst_valu_offset_1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_neg_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_neg_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_neg_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_neg_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_neg_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_neg_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x3800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits (1ull << 33) | 2047
define i8 @global_inst_valu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits (1ull << 33) | 2048
define i8 @global_inst_valu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2048
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits (1ull << 33) | 4095
define i8 @global_inst_valu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits (1ull << 33) | 4096
define i8 @global_inst_valu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits (1ull << 33) | 8191
define i8 @global_inst_valu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits (1ull << 33) | 8192
define i8 @global_inst_valu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2049
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; SALU cases: the pointer is a kernel argument that the expected code loads
; into s[0:1] with s_load_dwordx2. Where the constant can be split, the load
; uses the SGPR base plus a VGPR holding the high part; otherwise the full
; address is computed into v[0:1]. The loads are volatile and their result is
; stored to an undef pointer.

define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x1000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x1800
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-2048
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-2048
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-4096
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xfffff000, s0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xffffe000, s0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x1000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x1800
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x3800
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-4096
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xfffff000, s0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
;
GFX9: ; %bb.0: 901; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 902; GFX9-NEXT: s_waitcnt lgkmcnt(0) 903; GFX9-NEXT: v_mov_b32_e32 v0, s0 904; GFX9-NEXT: v_mov_b32_e32 v1, s1 905; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 906; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 907; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 908; GFX9-NEXT: s_waitcnt vmcnt(0) 909; GFX9-NEXT: global_store_byte v[0:1], v0, off 910; GFX9-NEXT: s_endpgm 911; 912; GFX10-LABEL: global_inst_salu_offset_2x_neg_12bit_max: 913; GFX10: ; %bb.0: 914; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 915; GFX10-NEXT: s_waitcnt lgkmcnt(0) 916; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffe000, s0 917; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 918; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 919; GFX10-NEXT: s_waitcnt vmcnt(0) 920; GFX10-NEXT: global_store_byte v[0:1], v0, off 921; GFX10-NEXT: s_endpgm 922 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192 923 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 924 store i8 %load, i8 addrspace(1)* undef 925 ret void 926} 927 928define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) { 929; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max: 930; GFX9: ; %bb.0: 931; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 932; GFX9-NEXT: s_waitcnt lgkmcnt(0) 933; GFX9-NEXT: v_mov_b32_e32 v0, s0 934; GFX9-NEXT: v_mov_b32_e32 v1, s1 935; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 936; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 937; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 938; GFX9-NEXT: s_waitcnt vmcnt(0) 939; GFX9-NEXT: global_store_byte v[0:1], v0, off 940; GFX9-NEXT: s_endpgm 941; 942; GFX10-LABEL: global_inst_salu_offset_2x_neg_13bit_max: 943; GFX10: ; %bb.0: 944; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 945; GFX10-NEXT: s_waitcnt lgkmcnt(0) 946; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffc000, s0 947; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 948; 
GFX10-NEXT: global_load_ubyte v0, v[0:1], off 949; GFX10-NEXT: s_waitcnt vmcnt(0) 950; GFX10-NEXT: global_store_byte v[0:1], v0, off 951; GFX10-NEXT: s_endpgm 952 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384 953 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 954 store i8 %load, i8 addrspace(1)* undef 955 ret void 956} 957 958; Fill 11-bit low-bits (1ull << 33) | 2047 959define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) { 960; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0: 961; GFX9: ; %bb.0: 962; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 963; GFX9-NEXT: s_waitcnt lgkmcnt(0) 964; GFX9-NEXT: v_mov_b32_e32 v1, s1 965; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 966; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 967; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 968; GFX9-NEXT: s_waitcnt vmcnt(0) 969; GFX9-NEXT: global_store_byte v[0:1], v0, off 970; GFX9-NEXT: s_endpgm 971; 972; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split0: 973; GFX10: ; %bb.0: 974; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 975; GFX10-NEXT: s_waitcnt lgkmcnt(0) 976; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0, s0 977; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 978; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 979; GFX10-NEXT: s_waitcnt vmcnt(0) 980; GFX10-NEXT: global_store_byte v[0:1], v0, off 981; GFX10-NEXT: s_endpgm 982 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639 983 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 984 store i8 %load, i8 addrspace(1)* undef 985 ret void 986} 987 988; Fill 11-bit low-bits (1ull << 33) | 2048 989define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) { 990; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1: 991; GFX9: ; %bb.0: 992; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 993; GFX9-NEXT: s_waitcnt lgkmcnt(0) 994; GFX9-NEXT: v_mov_b32_e32 v1, s1 995; GFX9-NEXT: v_add_co_u32_e64 
v0, vcc, 0, s0 996; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 997; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 998; GFX9-NEXT: s_waitcnt vmcnt(0) 999; GFX9-NEXT: global_store_byte v[0:1], v0, off 1000; GFX9-NEXT: s_endpgm 1001; 1002; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split1: 1003; GFX10: ; %bb.0: 1004; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1005; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1006; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 1007; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1008; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1009; GFX10-NEXT: s_waitcnt vmcnt(0) 1010; GFX10-NEXT: global_store_byte v[0:1], v0, off 1011; GFX10-NEXT: s_endpgm 1012 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640 1013 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1014 store i8 %load, i8 addrspace(1)* undef 1015 ret void 1016} 1017 1018; Fill 12-bit low-bits (1ull << 33) | 4095 1019define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) { 1020; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0: 1021; GFX9: ; %bb.0: 1022; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1023; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1024; GFX9-NEXT: v_mov_b32_e32 v1, s1 1025; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1026; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1027; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 1028; GFX9-NEXT: s_waitcnt vmcnt(0) 1029; GFX9-NEXT: global_store_byte v[0:1], v0, off 1030; GFX9-NEXT: s_endpgm 1031; 1032; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split0: 1033; GFX10: ; %bb.0: 1034; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1035; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1036; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 1037; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1038; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1039; GFX10-NEXT: s_waitcnt vmcnt(0) 1040; GFX10-NEXT: global_store_byte v[0:1], v0, off 1041; GFX10-NEXT: 
s_endpgm 1042 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687 1043 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1044 store i8 %load, i8 addrspace(1)* undef 1045 ret void 1046} 1047 1048; Fill 12-bit low-bits (1ull << 33) | 4096 1049define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) { 1050; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1: 1051; GFX9: ; %bb.0: 1052; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1053; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1054; GFX9-NEXT: v_mov_b32_e32 v0, s0 1055; GFX9-NEXT: v_mov_b32_e32 v1, s1 1056; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1057; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1058; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 1059; GFX9-NEXT: s_waitcnt vmcnt(0) 1060; GFX9-NEXT: global_store_byte v[0:1], v0, off 1061; GFX9-NEXT: s_endpgm 1062; 1063; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split1: 1064; GFX10: ; %bb.0: 1065; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1066; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1067; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1000, s0 1068; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1069; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1070; GFX10-NEXT: s_waitcnt vmcnt(0) 1071; GFX10-NEXT: global_store_byte v[0:1], v0, off 1072; GFX10-NEXT: s_endpgm 1073 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688 1074 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1075 store i8 %load, i8 addrspace(1)* undef 1076 ret void 1077} 1078 1079; Fill 13-bit low-bits (1ull << 33) | 8191 1080define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) { 1081; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0: 1082; GFX9: ; %bb.0: 1083; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1084; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1085; GFX9-NEXT: v_mov_b32_e32 v0, s0 1086; GFX9-NEXT: v_mov_b32_e32 v1, s1 1087; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1088; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1089; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 1090; GFX9-NEXT: s_waitcnt vmcnt(0) 1091; GFX9-NEXT: global_store_byte v[0:1], v0, off 1092; GFX9-NEXT: s_endpgm 1093; 1094; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split0: 1095; GFX10: ; %bb.0: 1096; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1097; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1098; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0 1099; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1100; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1101; GFX10-NEXT: s_waitcnt vmcnt(0) 1102; GFX10-NEXT: global_store_byte v[0:1], v0, off 1103; GFX10-NEXT: s_endpgm 1104 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783 1105 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1106 store i8 %load, i8 addrspace(1)* undef 1107 ret void 1108} 1109 1110; Fill 13-bit low-bits (1ull << 33) | 8192 1111define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) { 1112; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1: 1113; GFX9: ; %bb.0: 1114; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1115; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1116; GFX9-NEXT: v_mov_b32_e32 v0, s0 1117; GFX9-NEXT: v_mov_b32_e32 v1, s1 1118; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1119; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1120; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 1121; GFX9-NEXT: s_waitcnt vmcnt(0) 1122; GFX9-NEXT: global_store_byte v[0:1], v0, off 1123; GFX9-NEXT: s_endpgm 1124; 1125; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split1: 1126; GFX10: ; %bb.0: 1127; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1128; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1129; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x2000, s0 1130; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1131; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1132; GFX10-NEXT: s_waitcnt vmcnt(0) 1133; GFX10-NEXT: global_store_byte v[0:1], v0, off 1134; 
GFX10-NEXT: s_endpgm 1135 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784 1136 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1137 store i8 %load, i8 addrspace(1)* undef 1138 ret void 1139} 1140 1141; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047 1142define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) { 1143; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: 1144; GFX9: ; %bb.0: 1145; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1146; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1147; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1148; GFX9-NEXT: v_mov_b32_e32 v0, s0 1149; GFX9-NEXT: v_mov_b32_e32 v2, s1 1150; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1151; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1152; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 1153; GFX9-NEXT: s_waitcnt vmcnt(0) 1154; GFX9-NEXT: global_store_byte v[0:1], v0, off 1155; GFX9-NEXT: s_endpgm 1156; 1157; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: 1158; GFX10: ; %bb.0: 1159; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1160; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1161; GFX10-NEXT: v_mov_b32_e32 v1, s1 1162; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0 1163; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1164; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1165; GFX10-NEXT: s_waitcnt vmcnt(0) 1166; GFX10-NEXT: global_store_byte v[0:1], v0, off 1167; GFX10-NEXT: s_endpgm 1168 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761 1169 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1170 store i8 %load, i8 addrspace(1)* undef 1171 ret void 1172} 1173 1174; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048 1175define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) { 1176; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: 1177; GFX9: ; %bb.0: 1178; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1179; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1180; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1181; GFX9-NEXT: v_mov_b32_e32 v0, s0 1182; GFX9-NEXT: v_mov_b32_e32 v2, s1 1183; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1184; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1185; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 1186; GFX9-NEXT: s_waitcnt vmcnt(0) 1187; GFX9-NEXT: global_store_byte v[0:1], v0, off 1188; GFX9-NEXT: s_endpgm 1189; 1190; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: 1191; GFX10: ; %bb.0: 1192; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1193; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1194; GFX10-NEXT: v_mov_b32_e32 v1, s1 1195; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0 1196; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1197; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1198; GFX10-NEXT: s_waitcnt vmcnt(0) 1199; GFX10-NEXT: global_store_byte v[0:1], v0, off 1200; GFX10-NEXT: s_endpgm 1201 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760 1202 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1203 store i8 %load, i8 addrspace(1)* undef 1204 ret void 1205} 1206 1207; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095 1208define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) { 1209; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: 1210; GFX9: ; %bb.0: 1211; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1212; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1213; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1214; GFX9-NEXT: v_mov_b32_e32 v0, s0 1215; GFX9-NEXT: v_mov_b32_e32 v2, s1 1216; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1217; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1218; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1219; GFX9-NEXT: s_waitcnt vmcnt(0) 1220; GFX9-NEXT: global_store_byte v[0:1], v0, off 1221; GFX9-NEXT: s_endpgm 1222; 1223; 
GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: 1224; GFX10: ; %bb.0: 1225; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1226; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1227; GFX10-NEXT: v_mov_b32_e32 v1, s1 1228; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 1229; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1230; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1231; GFX10-NEXT: s_waitcnt vmcnt(0) 1232; GFX10-NEXT: global_store_byte v[0:1], v0, off 1233; GFX10-NEXT: s_endpgm 1234 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713 1235 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1236 store i8 %load, i8 addrspace(1)* undef 1237 ret void 1238} 1239 1240; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096 1241define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) { 1242; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: 1243; GFX9: ; %bb.0: 1244; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1245; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1246; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1247; GFX9-NEXT: v_mov_b32_e32 v0, s0 1248; GFX9-NEXT: v_mov_b32_e32 v2, s1 1249; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1250; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1251; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 1252; GFX9-NEXT: s_waitcnt vmcnt(0) 1253; GFX9-NEXT: global_store_byte v[0:1], v0, off 1254; GFX9-NEXT: s_endpgm 1255; 1256; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: 1257; GFX10: ; %bb.0: 1258; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1259; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1260; GFX10-NEXT: v_mov_b32_e32 v1, s1 1261; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 1262; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1263; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1264; GFX10-NEXT: s_waitcnt vmcnt(0) 1265; GFX10-NEXT: global_store_byte v[0:1], v0, off 1266; 
GFX10-NEXT: s_endpgm 1267 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712 1268 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1269 store i8 %load, i8 addrspace(1)* undef 1270 ret void 1271} 1272 1273; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191 1274define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) { 1275; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: 1276; GFX9: ; %bb.0: 1277; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1278; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1279; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1280; GFX9-NEXT: v_mov_b32_e32 v0, s0 1281; GFX9-NEXT: v_mov_b32_e32 v2, s1 1282; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1283; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1284; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1285; GFX9-NEXT: s_waitcnt vmcnt(0) 1286; GFX9-NEXT: global_store_byte v[0:1], v0, off 1287; GFX9-NEXT: s_endpgm 1288; 1289; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: 1290; GFX10: ; %bb.0: 1291; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1292; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1293; GFX10-NEXT: v_mov_b32_e32 v1, s1 1294; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0 1295; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1296; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1297; GFX10-NEXT: s_waitcnt vmcnt(0) 1298; GFX10-NEXT: global_store_byte v[0:1], v0, off 1299; GFX10-NEXT: s_endpgm 1300 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617 1301 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1302 store i8 %load, i8 addrspace(1)* undef 1303 ret void 1304} 1305 1306; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192 1307define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) { 1308; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: 1309; GFX9: ; %bb.0: 
1310; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1311; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1312; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1313; GFX9-NEXT: v_mov_b32_e32 v0, s0 1314; GFX9-NEXT: v_mov_b32_e32 v2, s1 1315; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1316; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1317; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 1318; GFX9-NEXT: s_waitcnt vmcnt(0) 1319; GFX9-NEXT: global_store_byte v[0:1], v0, off 1320; GFX9-NEXT: s_endpgm 1321; 1322; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: 1323; GFX10: ; %bb.0: 1324; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1325; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1326; GFX10-NEXT: v_mov_b32_e32 v1, s1 1327; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0 1328; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1329; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1330; GFX10-NEXT: s_waitcnt vmcnt(0) 1331; GFX10-NEXT: global_store_byte v[0:1], v0, off 1332; GFX10-NEXT: s_endpgm 1333 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616 1334 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1335 store i8 %load, i8 addrspace(1)* undef 1336 ret void 1337} 1338