; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s

; Test splitting flat instruction offsets into the low and high bits
; when the offset doesn't fit in the offset field.

; VALU variants: %p is an ordinary function argument, addressed as v[0:1] in
; the checks. Each function loads a single byte at a constant offset from %p
; and returns it.

define i8 @flat_inst_valu_offset_1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v0, 1
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 1
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

define i8 @flat_inst_valu_offset_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 2047
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

define i8 @flat_inst_valu_offset_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 4095
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

define i8 @flat_inst_valu_offset_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8191
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

define i8 @flat_inst_valu_offset_neg_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_neg_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -2048
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

define i8 @flat_inst_valu_offset_neg_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_neg_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_neg_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -4096
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

define i8 @flat_inst_valu_offset_neg_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_neg_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_neg_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -8192
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

define i8 @flat_inst_valu_offset_2x_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_2x_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 4095
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

define i8 @flat_inst_valu_offset_2x_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_2x_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8191
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

define i8 @flat_inst_valu_offset_2x_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_2x_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x3fff, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 16383
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

define i8 @flat_inst_valu_offset_2x_neg_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -4096
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

define i8 @flat_inst_valu_offset_2x_neg_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -8192
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

define i8 @flat_inst_valu_offset_2x_neg_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -16384
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits (1ull << 33) | 2047
define i8 @flat_inst_valu_offset_64bit_11bit_split0(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8589936639
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits (1ull << 33) | 2048
define i8 @flat_inst_valu_offset_64bit_11bit_split1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2048
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8589936640
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits (1ull << 33) | 4095
define i8 @flat_inst_valu_offset_64bit_12bit_split0(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8589938687
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits (1ull << 33) | 4096
define i8 @flat_inst_valu_offset_64bit_12bit_split1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8589938688
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits (1ull << 33) | 8191
define i8 @flat_inst_valu_offset_64bit_13bit_split0(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8589942783
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits (1ull << 33) | 8192
define i8 @flat_inst_valu_offset_64bit_13bit_split1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8589942784
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -9223372036854773761
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x800, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -9223372036854773760
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfff, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -9223372036854771713
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -9223372036854771712
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -9223372036854767617
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -9223372036854767616
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; SALU variants: amdgpu_kernel entry points. The checks show %p being fetched
; with s_load_dwordx2; each kernel does a volatile byte load at a constant
; offset from %p and stores the result through an undef pointer.

define amdgpu_kernel void @flat_inst_salu_offset_1(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_byte v[0:1], v0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s0, 1
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_store_byte v[0:1], v0
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8* %p, i64 1
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_byte v[0:1], v0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s0, 0x7ff
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_store_byte v[0:1], v0
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8* %p, i64 2047
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_byte v[0:1], v0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s0, 0xfff
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_store_byte v[0:1], v0
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8* %p, i64 4095
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_byte v[0:1], v0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s0, 0x1fff
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_store_byte v[0:1], v0
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8* %p, i64 8191
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_byte v[0:1], v0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s0, 0xfffff800
; GFX10-NEXT:    s_addc_u32 s1, s1, -1
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_store_byte v[0:1], v0
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8* %p, i64 -2048
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_byte v[0:1], v0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s0, 0xfffff000
; GFX10-NEXT:    s_addc_u32 s1, s1, -1
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_store_byte v[0:1], v0
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8* %p, i64 -4096
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_byte v[0:1], v0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s0, 0xffffe000
; GFX10-NEXT:    s_addc_u32 s1, s1, -1
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_store_byte v[0:1], v0
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8* %p, i64 -8192
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_byte v[0:1], v0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s0, 0xfff
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_store_byte v[0:1], v0
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8* %p, i64 4095
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_byte v[0:1], v0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s0, 0x1fff
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_store_byte v[0:1], v0
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8* %p, i64 8191
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_byte v[0:1], v0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s0, 0x3fff
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_store_byte v[0:1], v0
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8* %p, i64 16383
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_byte v[0:1], v0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; 
GFX10: ; %bb.0: 946; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 947; GFX10-NEXT: s_waitcnt lgkmcnt(0) 948; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000 949; GFX10-NEXT: s_addc_u32 s1, s1, -1 950; GFX10-NEXT: v_mov_b32_e32 v0, s0 951; GFX10-NEXT: v_mov_b32_e32 v1, s1 952; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 953; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 954; GFX10-NEXT: flat_store_byte v[0:1], v0 955; GFX10-NEXT: s_endpgm 956 %gep = getelementptr i8, i8* %p, i64 -4096 957 %load = load volatile i8, i8* %gep, align 1 958 store i8 %load, i8* undef 959 ret void 960} 961 962define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(i8* %p) { 963; GFX9-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: 964; GFX9: ; %bb.0: 965; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 966; GFX9-NEXT: s_waitcnt lgkmcnt(0) 967; GFX9-NEXT: v_mov_b32_e32 v0, s0 968; GFX9-NEXT: v_mov_b32_e32 v1, s1 969; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 970; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 971; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 972; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 973; GFX9-NEXT: flat_store_byte v[0:1], v0 974; GFX9-NEXT: s_endpgm 975; 976; GFX10-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: 977; GFX10: ; %bb.0: 978; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 979; GFX10-NEXT: s_waitcnt lgkmcnt(0) 980; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000 981; GFX10-NEXT: s_addc_u32 s1, s1, -1 982; GFX10-NEXT: v_mov_b32_e32 v0, s0 983; GFX10-NEXT: v_mov_b32_e32 v1, s1 984; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 985; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 986; GFX10-NEXT: flat_store_byte v[0:1], v0 987; GFX10-NEXT: s_endpgm 988 %gep = getelementptr i8, i8* %p, i64 -8192 989 %load = load volatile i8, i8* %gep, align 1 990 store i8 %load, i8* undef 991 ret void 992} 993 994define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(i8* %p) { 995; GFX9-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: 996; GFX9: ; %bb.0: 997; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 998; GFX9-NEXT: s_waitcnt lgkmcnt(0) 999; GFX9-NEXT: v_mov_b32_e32 v0, s0 1000; GFX9-NEXT: v_mov_b32_e32 v1, s1 1001; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 1002; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 1003; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1004; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1005; GFX9-NEXT: flat_store_byte v[0:1], v0 1006; GFX9-NEXT: s_endpgm 1007; 1008; GFX10-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: 1009; GFX10: ; %bb.0: 1010; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1011; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1012; GFX10-NEXT: s_add_u32 s0, s0, 0xffffc000 1013; GFX10-NEXT: s_addc_u32 s1, s1, -1 1014; GFX10-NEXT: v_mov_b32_e32 v0, s0 1015; GFX10-NEXT: v_mov_b32_e32 v1, s1 1016; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1017; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1018; GFX10-NEXT: flat_store_byte v[0:1], v0 1019; GFX10-NEXT: s_endpgm 1020 %gep = getelementptr i8, i8* %p, i64 -16384 1021 %load = load volatile i8, i8* %gep, align 1 1022 store i8 %load, i8* undef 1023 ret void 1024} 1025 1026; Fill 11-bit low-bits (1ull << 33) | 2047 1027define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(i8* %p) { 1028; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split0: 1029; GFX9: ; %bb.0: 1030; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1031; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1032; GFX9-NEXT: v_mov_b32_e32 v1, s1 1033; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1034; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1035; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 1036; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1037; GFX9-NEXT: flat_store_byte v[0:1], v0 1038; GFX9-NEXT: s_endpgm 1039; 1040; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split0: 1041; GFX10: ; %bb.0: 1042; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1043; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1044; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff 1045; GFX10-NEXT: s_addc_u32 s1, s1, 2 1046; GFX10-NEXT: v_mov_b32_e32 v0, s0 1047; 
GFX10-NEXT: v_mov_b32_e32 v1, s1 1048; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1049; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1050; GFX10-NEXT: flat_store_byte v[0:1], v0 1051; GFX10-NEXT: s_endpgm 1052 %gep = getelementptr i8, i8* %p, i64 8589936639 1053 %load = load volatile i8, i8* %gep, align 1 1054 store i8 %load, i8* undef 1055 ret void 1056} 1057 1058; Fill 11-bit low-bits (1ull << 33) | 2048 1059define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(i8* %p) { 1060; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split1: 1061; GFX9: ; %bb.0: 1062; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1063; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1064; GFX9-NEXT: v_mov_b32_e32 v1, s1 1065; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1066; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1067; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 1068; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1069; GFX9-NEXT: flat_store_byte v[0:1], v0 1070; GFX9-NEXT: s_endpgm 1071; 1072; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split1: 1073; GFX10: ; %bb.0: 1074; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1075; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1076; GFX10-NEXT: s_add_u32 s0, s0, 0x800 1077; GFX10-NEXT: s_addc_u32 s1, s1, 2 1078; GFX10-NEXT: v_mov_b32_e32 v0, s0 1079; GFX10-NEXT: v_mov_b32_e32 v1, s1 1080; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1081; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1082; GFX10-NEXT: flat_store_byte v[0:1], v0 1083; GFX10-NEXT: s_endpgm 1084 %gep = getelementptr i8, i8* %p, i64 8589936640 1085 %load = load volatile i8, i8* %gep, align 1 1086 store i8 %load, i8* undef 1087 ret void 1088} 1089 1090; Fill 12-bit low-bits (1ull << 33) | 4095 1091define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(i8* %p) { 1092; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split0: 1093; GFX9: ; %bb.0: 1094; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1095; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1096; GFX9-NEXT: v_mov_b32_e32 v1, s1 1097; GFX9-NEXT: 
v_add_co_u32_e64 v0, vcc, 0, s0 1098; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1099; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 1100; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1101; GFX9-NEXT: flat_store_byte v[0:1], v0 1102; GFX9-NEXT: s_endpgm 1103; 1104; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split0: 1105; GFX10: ; %bb.0: 1106; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1107; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1108; GFX10-NEXT: s_add_u32 s0, s0, 0xfff 1109; GFX10-NEXT: s_addc_u32 s1, s1, 2 1110; GFX10-NEXT: v_mov_b32_e32 v0, s0 1111; GFX10-NEXT: v_mov_b32_e32 v1, s1 1112; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1113; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1114; GFX10-NEXT: flat_store_byte v[0:1], v0 1115; GFX10-NEXT: s_endpgm 1116 %gep = getelementptr i8, i8* %p, i64 8589938687 1117 %load = load volatile i8, i8* %gep, align 1 1118 store i8 %load, i8* undef 1119 ret void 1120} 1121 1122; Fill 12-bit low-bits (1ull << 33) | 4096 1123define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(i8* %p) { 1124; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split1: 1125; GFX9: ; %bb.0: 1126; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1127; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1128; GFX9-NEXT: v_mov_b32_e32 v0, s0 1129; GFX9-NEXT: v_mov_b32_e32 v1, s1 1130; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1131; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1132; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1133; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1134; GFX9-NEXT: flat_store_byte v[0:1], v0 1135; GFX9-NEXT: s_endpgm 1136; 1137; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split1: 1138; GFX10: ; %bb.0: 1139; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1140; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1141; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 1142; GFX10-NEXT: s_addc_u32 s1, s1, 2 1143; GFX10-NEXT: v_mov_b32_e32 v0, s0 1144; GFX10-NEXT: v_mov_b32_e32 v1, s1 1145; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1146; GFX10-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 1147; GFX10-NEXT: flat_store_byte v[0:1], v0 1148; GFX10-NEXT: s_endpgm 1149 %gep = getelementptr i8, i8* %p, i64 8589938688 1150 %load = load volatile i8, i8* %gep, align 1 1151 store i8 %load, i8* undef 1152 ret void 1153} 1154 1155; Fill 13-bit low-bits (1ull << 33) | 8191 1156define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(i8* %p) { 1157; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split0: 1158; GFX9: ; %bb.0: 1159; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1160; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1161; GFX9-NEXT: v_mov_b32_e32 v0, s0 1162; GFX9-NEXT: v_mov_b32_e32 v1, s1 1163; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1164; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1165; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 1166; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1167; GFX9-NEXT: flat_store_byte v[0:1], v0 1168; GFX9-NEXT: s_endpgm 1169; 1170; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split0: 1171; GFX10: ; %bb.0: 1172; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1173; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1174; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff 1175; GFX10-NEXT: s_addc_u32 s1, s1, 2 1176; GFX10-NEXT: v_mov_b32_e32 v0, s0 1177; GFX10-NEXT: v_mov_b32_e32 v1, s1 1178; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1179; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1180; GFX10-NEXT: flat_store_byte v[0:1], v0 1181; GFX10-NEXT: s_endpgm 1182 %gep = getelementptr i8, i8* %p, i64 8589942783 1183 %load = load volatile i8, i8* %gep, align 1 1184 store i8 %load, i8* undef 1185 ret void 1186} 1187 1188; Fill 13-bit low-bits (1ull << 33) | 8192 1189define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(i8* %p) { 1190; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split1: 1191; GFX9: ; %bb.0: 1192; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1193; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1194; GFX9-NEXT: v_mov_b32_e32 v0, s0 1195; GFX9-NEXT: v_mov_b32_e32 v1, s1 1196; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 
1197; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1198; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1199; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1200; GFX9-NEXT: flat_store_byte v[0:1], v0 1201; GFX9-NEXT: s_endpgm 1202; 1203; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split1: 1204; GFX10: ; %bb.0: 1205; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1206; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1207; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 1208; GFX10-NEXT: s_addc_u32 s1, s1, 2 1209; GFX10-NEXT: v_mov_b32_e32 v0, s0 1210; GFX10-NEXT: v_mov_b32_e32 v1, s1 1211; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1212; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1213; GFX10-NEXT: flat_store_byte v[0:1], v0 1214; GFX10-NEXT: s_endpgm 1215 %gep = getelementptr i8, i8* %p, i64 8589942784 1216 %load = load volatile i8, i8* %gep, align 1 1217 store i8 %load, i8* undef 1218 ret void 1219} 1220 1221; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047 1222define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(i8* %p) { 1223; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: 1224; GFX9: ; %bb.0: 1225; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1226; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1227; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1228; GFX9-NEXT: v_mov_b32_e32 v0, s0 1229; GFX9-NEXT: v_mov_b32_e32 v2, s1 1230; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0 1231; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1232; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1233; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1234; GFX9-NEXT: flat_store_byte v[0:1], v0 1235; GFX9-NEXT: s_endpgm 1236; 1237; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: 1238; GFX10: ; %bb.0: 1239; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1240; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1241; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff 1242; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 1243; GFX10-NEXT: v_mov_b32_e32 v0, s0 1244; GFX10-NEXT: v_mov_b32_e32 v1, s1 1245; GFX10-NEXT: flat_load_ubyte 
v0, v[0:1] 1246; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1247; GFX10-NEXT: flat_store_byte v[0:1], v0 1248; GFX10-NEXT: s_endpgm 1249 %gep = getelementptr i8, i8* %p, i64 -9223372036854773761 1250 %load = load volatile i8, i8* %gep, align 1 1251 store i8 %load, i8* undef 1252 ret void 1253} 1254 1255; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048 1256define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(i8* %p) { 1257; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: 1258; GFX9: ; %bb.0: 1259; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1260; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1261; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1262; GFX9-NEXT: v_mov_b32_e32 v0, s0 1263; GFX9-NEXT: v_mov_b32_e32 v2, s1 1264; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0 1265; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1266; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1267; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1268; GFX9-NEXT: flat_store_byte v[0:1], v0 1269; GFX9-NEXT: s_endpgm 1270; 1271; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: 1272; GFX10: ; %bb.0: 1273; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1274; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1275; GFX10-NEXT: s_add_u32 s0, s0, 0x800 1276; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 1277; GFX10-NEXT: v_mov_b32_e32 v0, s0 1278; GFX10-NEXT: v_mov_b32_e32 v1, s1 1279; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1280; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1281; GFX10-NEXT: flat_store_byte v[0:1], v0 1282; GFX10-NEXT: s_endpgm 1283 %gep = getelementptr i8, i8* %p, i64 -9223372036854773760 1284 %load = load volatile i8, i8* %gep, align 1 1285 store i8 %load, i8* undef 1286 ret void 1287} 1288 1289; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095 1290define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(i8* %p) { 1291; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: 1292; GFX9: ; %bb.0: 1293; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 1294; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1295; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1296; GFX9-NEXT: v_mov_b32_e32 v0, s0 1297; GFX9-NEXT: v_mov_b32_e32 v2, s1 1298; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0 1299; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1300; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1301; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1302; GFX9-NEXT: flat_store_byte v[0:1], v0 1303; GFX9-NEXT: s_endpgm 1304; 1305; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: 1306; GFX10: ; %bb.0: 1307; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1308; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1309; GFX10-NEXT: s_add_u32 s0, s0, 0xfff 1310; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 1311; GFX10-NEXT: v_mov_b32_e32 v0, s0 1312; GFX10-NEXT: v_mov_b32_e32 v1, s1 1313; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1314; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1315; GFX10-NEXT: flat_store_byte v[0:1], v0 1316; GFX10-NEXT: s_endpgm 1317 %gep = getelementptr i8, i8* %p, i64 -9223372036854771713 1318 %load = load volatile i8, i8* %gep, align 1 1319 store i8 %load, i8* undef 1320 ret void 1321} 1322 1323; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096 1324define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(i8* %p) { 1325; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: 1326; GFX9: ; %bb.0: 1327; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1328; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1329; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1330; GFX9-NEXT: v_mov_b32_e32 v0, s0 1331; GFX9-NEXT: v_mov_b32_e32 v2, s1 1332; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1333; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1334; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1335; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1336; GFX9-NEXT: flat_store_byte v[0:1], v0 1337; GFX9-NEXT: s_endpgm 1338; 1339; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: 1340; GFX10: ; %bb.0: 1341; GFX10-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 1342; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1343; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 1344; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 1345; GFX10-NEXT: v_mov_b32_e32 v0, s0 1346; GFX10-NEXT: v_mov_b32_e32 v1, s1 1347; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1348; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1349; GFX10-NEXT: flat_store_byte v[0:1], v0 1350; GFX10-NEXT: s_endpgm 1351 %gep = getelementptr i8, i8* %p, i64 -9223372036854771712 1352 %load = load volatile i8, i8* %gep, align 1 1353 store i8 %load, i8* undef 1354 ret void 1355} 1356 1357; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191 1358define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(i8* %p) { 1359; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: 1360; GFX9: ; %bb.0: 1361; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1362; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1363; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1364; GFX9-NEXT: v_mov_b32_e32 v0, s0 1365; GFX9-NEXT: v_mov_b32_e32 v2, s1 1366; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0 1367; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1368; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1369; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1370; GFX9-NEXT: flat_store_byte v[0:1], v0 1371; GFX9-NEXT: s_endpgm 1372; 1373; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: 1374; GFX10: ; %bb.0: 1375; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1376; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1377; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff 1378; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 1379; GFX10-NEXT: v_mov_b32_e32 v0, s0 1380; GFX10-NEXT: v_mov_b32_e32 v1, s1 1381; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1382; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1383; GFX10-NEXT: flat_store_byte v[0:1], v0 1384; GFX10-NEXT: s_endpgm 1385 %gep = getelementptr i8, i8* %p, i64 -9223372036854767617 1386 %load = load volatile i8, i8* %gep, align 1 1387 store i8 %load, i8* undef 1388 ret void 1389} 1390 1391; Fill 13-bit low-bits, 
negative high bits (1ull << 63) | 8192 1392define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(i8* %p) { 1393; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: 1394; GFX9: ; %bb.0: 1395; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1396; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1397; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1398; GFX9-NEXT: v_mov_b32_e32 v0, s0 1399; GFX9-NEXT: v_mov_b32_e32 v2, s1 1400; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1401; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1402; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1403; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1404; GFX9-NEXT: flat_store_byte v[0:1], v0 1405; GFX9-NEXT: s_endpgm 1406; 1407; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: 1408; GFX10: ; %bb.0: 1409; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1410; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1411; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 1412; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 1413; GFX10-NEXT: v_mov_b32_e32 v0, s0 1414; GFX10-NEXT: v_mov_b32_e32 v1, s1 1415; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1416; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1417; GFX10-NEXT: flat_store_byte v[0:1], v0 1418; GFX10-NEXT: s_endpgm 1419 %gep = getelementptr i8, i8* %p, i64 -9223372036854767616 1420 %load = load volatile i8, i8* %gep, align 1 1421 store i8 %load, i8* undef 1422 ret void 1423} 1424