; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX9
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX10

; Test that add/sub with a constant is swapped to sub/add with negated
; constant to minimize code size.

; Vector case, x - 64: each work-item loads an i32, subtracts 64 and stores it.
; Every target is expected to keep a reversed subtract (v_subrev_*) with the
; immediate 64 rather than an add of -64.
; NOTE(review): presumably because 64 is encodable as an inline immediate while
; -64 is not -- confirm against the ISA documentation.
define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: v_test_i32_x_sub_64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_i32_x_sub_64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_x_sub_64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 %x, 64
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

; Same x - 64 pattern applied to two separate values. The loads and stores are
; volatile, so both loads and both stores appear in the expected output, and
; each subtract is expected to use v_subrev_* with the immediate 64.
define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: v_test_i32_x_sub_64_multi_use:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_i32_x_sub_64_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %gep
  %y = load volatile i32, i32 addrspace(1)* %gep
  %result0 = sub i32 %x, 64
  %result1 = sub i32 %y, 64
  store volatile i32 %result0, i32 addrspace(1)* %gep.out
  store volatile i32 %result1, i32 addrspace(1)* %gep.out
  ret void
}

; Constant on the left-hand side, 64 - x. The expected output is a plain
; subtract (v_sub_*) with 64 as the first source operand; no negation is done.
define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: v_test_i32_64_sub_x:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v2, vcc, 64, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_i32_64_sub_x:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v0, vcc, 64, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_64_sub_x:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 64, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_64_sub_x:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 64, %x
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

; x - 65: unlike the 64 case, the expected output is an add of the 32-bit
; literal 0xffffffbf (that is, -65) on every target; no v_subrev form is used.
define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: v_test_i32_x_sub_65:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffffbf, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_i32_x_sub_65:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 0xffffffbf, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_x_sub_65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_65:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 %x, 65
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

; 65 - x: expected output is a plain subtract with the literal 0x41 (65) as the
; first source operand.
define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: v_test_i32_65_sub_x:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v2, vcc, 0x41, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_i32_65_sub_x:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x41, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_65_sub_x:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 0x41, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_65_sub_x:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 65, %x
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

; x - (-16): expected output is an add of the immediate 16 on every target.
define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: v_test_i32_x_sub_neg16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_i32_x_sub_neg16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_x_sub_neg16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v1, 16, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_neg16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v1, 16, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 %x, -16
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

; (-16) - x: expected output is a plain subtract with the immediate -16 as the
; first source operand.
define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: v_test_i32_neg16_sub_x:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v2, vcc, -16, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_i32_neg16_sub_x:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v0, vcc, -16, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_neg16_sub_x:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, -16, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_neg16_sub_x:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, -16, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 -16, %x
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

; x - (-17): expected output is an add of the immediate 17 on every target.
define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: v_test_i32_x_sub_neg17:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 17, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_i32_x_sub_neg17:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 17, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_x_sub_neg17:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v1, 17, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_neg17:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v1, 17, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 %x, -17
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

; (-17) - x: expected output is a plain subtract with the 32-bit literal
; 0xffffffef (that is, -17) as the first source operand.
define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-LABEL: v_test_i32_neg17_sub_x:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v2, vcc, 0xffffffef, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_i32_neg17_sub_x:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xffffffef, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_neg17_sub_x:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 0xffffffef, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_neg17_sub_x:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 -17, %x
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

; Scalar case, x - 64: %x is a kernel argument and the result is consumed by
; inline asm through an "s" constraint. Every target is expected to emit
; s_sub_i32 with the immediate 64.
define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 {
; SI-LABEL: s_test_i32_x_sub_64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_i32 s0, s0, 64
; SI-NEXT: ;;#ASMSTART
; SI-NEXT: ; use s0
; SI-NEXT: ;;#ASMEND
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_i32_x_sub_64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sub_i32 s0, s0, 64
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s0
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_i32_x_sub_64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_i32 s0, s0, 64
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_i32_x_sub_64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_i32 s0, s0, 64
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s0
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
  %result = sub i32 %x, 64
  call void asm sideeffect "; use $0", "s"(i32 %result)
  ret void
}

; i16 variant of x - 64. For SI the expected subtract is the 32-bit
; v_subrev_i32 followed by a short store; VI and GFX9 expect v_subrev_u16;
; GFX10 expects the VOP3 form v_sub_nc_u16_e64 with 64 as the second operand.
define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
; SI-LABEL: v_test_i16_x_sub_64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_i16_x_sub_64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
; VI-NEXT: flat_store_short v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i16_x_sub_64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  %x = load i16, i16 addrspace(1)* %gep
  %result = sub i16 %x, 64
  store i16 %result, i16 addrspace(1)* %gep.out
  ret void
}

; i16 x - 64 whose result is zero-extended to i32 before a dword store. The SI
; and GFX10 expectations contain an explicit v_and_b32 with 0xffff after the
; subtract; the VI and GFX9 expectations store the 16-bit subtract result
; directly with no separate masking instruction.
define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
; SI-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_subrev_i32_e32 v0, vcc, 64, v3
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
; VI-NEXT: flat_load_ushort v0, v[1:2]
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
; VI-NEXT: flat_store_dword v[3:4], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i16, i16 addrspace(1)* %gep
  %result = sub i16 %x, 64
  %zext = zext i16 %result to i32
  store i32 %zext, i32 addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
; SI-LABEL: v_test_i16_x_sub_64_multi_use:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3
; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_store_short v3, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_i16_x_sub_64_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: 
v_add_u32_e32 v0, vcc, s2, v2 825; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 826; VI-NEXT: flat_load_ushort v3, v[0:1] 827; VI-NEXT: flat_load_ushort v4, v[0:1] 828; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 829; VI-NEXT: v_mov_b32_e32 v1, s1 830; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 831; VI-NEXT: s_waitcnt vmcnt(1) 832; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 833; VI-NEXT: s_waitcnt vmcnt(0) 834; VI-NEXT: v_subrev_u16_e32 v3, 64, v4 835; VI-NEXT: flat_store_short v[0:1], v2 836; VI-NEXT: flat_store_short v[0:1], v3 837; VI-NEXT: s_endpgm 838; 839; GFX9-LABEL: v_test_i16_x_sub_64_multi_use: 840; GFX9: ; %bb.0: 841; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 842; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 843; GFX9-NEXT: s_waitcnt lgkmcnt(0) 844; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] 845; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] 846; GFX9-NEXT: s_waitcnt vmcnt(1) 847; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 848; GFX9-NEXT: s_waitcnt vmcnt(0) 849; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2 850; GFX9-NEXT: global_store_short v0, v1, s[0:1] 851; GFX9-NEXT: global_store_short v0, v2, s[0:1] 852; GFX9-NEXT: s_endpgm 853; 854; GFX10-LABEL: v_test_i16_x_sub_64_multi_use: 855; GFX10: ; %bb.0: 856; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 857; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 858; GFX10-NEXT: s_waitcnt lgkmcnt(0) 859; GFX10-NEXT: s_clause 0x1 860; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] 861; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] 862; GFX10-NEXT: s_waitcnt vmcnt(1) 863; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64 864; GFX10-NEXT: s_waitcnt vmcnt(0) 865; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 64 866; GFX10-NEXT: global_store_short v0, v1, s[0:1] 867; GFX10-NEXT: global_store_short v0, v2, s[0:1] 868; GFX10-NEXT: s_endpgm 869 %tid = call i32 @llvm.amdgcn.workitem.id.x() 870 %tid.ext = sext i32 %tid to i64 871 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext 872 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, 
i64 %tid.ext 873 %x = load volatile i16, i16 addrspace(1)* %gep 874 %y = load volatile i16, i16 addrspace(1)* %gep 875 %result0 = sub i16 %x, 64 876 %result1 = sub i16 %y, 64 877 store volatile i16 %result0, i16 addrspace(1)* %gep.out 878 store volatile i16 %result1, i16 addrspace(1)* %gep.out 879 ret void 880} 881 882define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { 883; SI-LABEL: v_test_v2i16_x_sub_64_64: 884; SI: ; %bb.0: 885; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 886; SI-NEXT: s_mov_b32 s7, 0xf000 887; SI-NEXT: s_mov_b32 s6, 0 888; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 889; SI-NEXT: v_mov_b32_e32 v1, 0 890; SI-NEXT: s_waitcnt lgkmcnt(0) 891; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 892; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 893; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 894; SI-NEXT: s_waitcnt vmcnt(0) 895; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2 896; SI-NEXT: s_mov_b32 s4, 0xffff0000 897; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 898; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2 899; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 900; SI-NEXT: s_endpgm 901; 902; VI-LABEL: v_test_v2i16_x_sub_64_64: 903; VI: ; %bb.0: 904; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 905; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 906; VI-NEXT: v_mov_b32_e32 v4, 64 907; VI-NEXT: s_waitcnt lgkmcnt(0) 908; VI-NEXT: v_mov_b32_e32 v1, s3 909; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 910; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 911; VI-NEXT: flat_load_dword v3, v[0:1] 912; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 913; VI-NEXT: v_mov_b32_e32 v1, s1 914; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 915; VI-NEXT: s_waitcnt vmcnt(0) 916; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 917; VI-NEXT: v_subrev_u16_e32 v3, 64, v3 918; VI-NEXT: v_or_b32_e32 v2, v3, v2 919; VI-NEXT: flat_store_dword v[0:1], v2 920; VI-NEXT: s_endpgm 921; 922; GFX9-LABEL: 
v_test_v2i16_x_sub_64_64: 923; GFX9: ; %bb.0: 924; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 925; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 926; GFX9-NEXT: s_waitcnt lgkmcnt(0) 927; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 928; GFX9-NEXT: s_waitcnt vmcnt(0) 929; GFX9-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] 930; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 931; GFX9-NEXT: s_endpgm 932; 933; GFX10-LABEL: v_test_v2i16_x_sub_64_64: 934; GFX10: ; %bb.0: 935; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 936; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 937; GFX10-NEXT: s_waitcnt lgkmcnt(0) 938; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 939; GFX10-NEXT: s_waitcnt vmcnt(0) 940; GFX10-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] 941; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 942; GFX10-NEXT: s_endpgm 943 %tid = call i32 @llvm.amdgcn.workitem.id.x() 944 %tid.ext = sext i32 %tid to i64 945 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 946 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 947 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep 948 %result = sub <2 x i16> %x, <i16 64, i16 64> 949 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out 950 ret void 951} 952 953define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { 954; SI-LABEL: v_test_v2i16_x_sub_7_64: 955; SI: ; %bb.0: 956; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 957; SI-NEXT: s_mov_b32 s7, 0xf000 958; SI-NEXT: s_mov_b32 s6, 0 959; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 960; SI-NEXT: v_mov_b32_e32 v1, 0 961; SI-NEXT: s_waitcnt lgkmcnt(0) 962; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 963; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 964; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 965; SI-NEXT: s_waitcnt vmcnt(0) 966; SI-NEXT: v_add_i32_e32 v3, vcc, -7, v2 967; SI-NEXT: s_mov_b32 s4, 0xffff0000 968; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 969; SI-NEXT: v_add_i32_e32 v2, 
vcc, 0xffc00000, v2 970; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 971; SI-NEXT: s_endpgm 972; 973; VI-LABEL: v_test_v2i16_x_sub_7_64: 974; VI: ; %bb.0: 975; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 976; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 977; VI-NEXT: v_mov_b32_e32 v4, 64 978; VI-NEXT: s_waitcnt lgkmcnt(0) 979; VI-NEXT: v_mov_b32_e32 v1, s3 980; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 981; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 982; VI-NEXT: flat_load_dword v3, v[0:1] 983; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 984; VI-NEXT: v_mov_b32_e32 v1, s1 985; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 986; VI-NEXT: s_waitcnt vmcnt(0) 987; VI-NEXT: v_add_u16_e32 v2, -7, v3 988; VI-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 989; VI-NEXT: v_or_b32_e32 v2, v2, v3 990; VI-NEXT: flat_store_dword v[0:1], v2 991; VI-NEXT: s_endpgm 992; 993; GFX9-LABEL: v_test_v2i16_x_sub_7_64: 994; GFX9: ; %bb.0: 995; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 996; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 997; GFX9-NEXT: s_waitcnt lgkmcnt(0) 998; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 999; GFX9-NEXT: s_mov_b32 s2, 0x400007 1000; GFX9-NEXT: s_waitcnt vmcnt(0) 1001; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2 1002; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1003; GFX9-NEXT: s_endpgm 1004; 1005; GFX10-LABEL: v_test_v2i16_x_sub_7_64: 1006; GFX10: ; %bb.0: 1007; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1008; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1009; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1010; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 1011; GFX10-NEXT: s_waitcnt vmcnt(0) 1012; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x400007 1013; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1014; GFX10-NEXT: s_endpgm 1015 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1016 %tid.ext = sext i32 %tid to i64 1017 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 1018 %gep.out = getelementptr inbounds <2 x 
i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 1019 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep 1020 %result = sub <2 x i16> %x, <i16 7, i16 64> 1021 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out 1022 ret void 1023} 1024 1025define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { 1026; SI-LABEL: v_test_v2i16_x_sub_64_123: 1027; SI: ; %bb.0: 1028; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1029; SI-NEXT: s_mov_b32 s7, 0xf000 1030; SI-NEXT: s_mov_b32 s6, 0 1031; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1032; SI-NEXT: v_mov_b32_e32 v1, 0 1033; SI-NEXT: s_waitcnt lgkmcnt(0) 1034; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1035; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1036; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1037; SI-NEXT: s_waitcnt vmcnt(0) 1038; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2 1039; SI-NEXT: s_mov_b32 s4, 0xffff0000 1040; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 1041; SI-NEXT: v_add_i32_e32 v2, vcc, 0xff850000, v2 1042; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1043; SI-NEXT: s_endpgm 1044; 1045; VI-LABEL: v_test_v2i16_x_sub_64_123: 1046; VI: ; %bb.0: 1047; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1048; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1049; VI-NEXT: v_mov_b32_e32 v4, 0xffffff85 1050; VI-NEXT: s_waitcnt lgkmcnt(0) 1051; VI-NEXT: v_mov_b32_e32 v1, s3 1052; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1053; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1054; VI-NEXT: flat_load_dword v3, v[0:1] 1055; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1056; VI-NEXT: v_mov_b32_e32 v1, s1 1057; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1058; VI-NEXT: s_waitcnt vmcnt(0) 1059; VI-NEXT: v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1060; VI-NEXT: v_subrev_u16_e32 v3, 64, v3 1061; VI-NEXT: v_or_b32_e32 v2, v3, v2 1062; VI-NEXT: flat_store_dword v[0:1], v2 1063; VI-NEXT: s_endpgm 1064; 1065; GFX9-LABEL: v_test_v2i16_x_sub_64_123: 1066; GFX9: 
; %bb.0: 1067; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1068; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1069; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1070; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1071; GFX9-NEXT: s_mov_b32 s2, 0x7b0040 1072; GFX9-NEXT: s_waitcnt vmcnt(0) 1073; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2 1074; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1075; GFX9-NEXT: s_endpgm 1076; 1077; GFX10-LABEL: v_test_v2i16_x_sub_64_123: 1078; GFX10: ; %bb.0: 1079; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1080; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1081; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1082; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 1083; GFX10-NEXT: s_waitcnt vmcnt(0) 1084; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040 1085; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1086; GFX10-NEXT: s_endpgm 1087 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1088 %tid.ext = sext i32 %tid to i64 1089 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 1090 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 1091 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep 1092 %result = sub <2 x i16> %x, <i16 64, i16 123> 1093 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out 1094 ret void 1095} 1096 1097; Can fold 0 and inline immediate in other half. 
; x - <7, 0>: only the low 16-bit half is modified.
; Expected code per the checks below:
;   - GFX9/GFX10 use a single v_pk_sub_i16 with the constant operand 7.
;   - SI and VI compute the low half as an add of -7 and then recombine it
;     with the unchanged high half (v_bfi_b32 with mask 0xffff on SI,
;     v_and_b32 0xffff0000 + v_or_b32 on VI).
; The CHECK lines are generated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_sub_7_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, -7, v2
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_sub_7_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_add_u16_e32 v3, -7, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, 7
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_7_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 7
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  ; Each work-item indexes %in and %out by its workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = sub <2 x i16> %x, <i16 7, i16 0>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}
1166 1167; Can fold 0 and inline immediate in other half. 1168define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { 1169; SI-LABEL: v_test_v2i16_x_sub_0_16: 1170; SI: ; %bb.0: 1171; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1172; SI-NEXT: s_mov_b32 s7, 0xf000 1173; SI-NEXT: s_mov_b32 s6, 0 1174; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1175; SI-NEXT: v_mov_b32_e32 v1, 0 1176; SI-NEXT: s_waitcnt lgkmcnt(0) 1177; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1178; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1179; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1180; SI-NEXT: s_waitcnt vmcnt(0) 1181; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2 1182; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1183; SI-NEXT: s_endpgm 1184; 1185; VI-LABEL: v_test_v2i16_x_sub_0_16: 1186; VI: ; %bb.0: 1187; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1188; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1189; VI-NEXT: s_waitcnt lgkmcnt(0) 1190; VI-NEXT: v_mov_b32_e32 v1, s3 1191; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1192; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1193; VI-NEXT: flat_load_dword v0, v[0:1] 1194; VI-NEXT: v_mov_b32_e32 v1, -16 1195; 
VI-NEXT: v_mov_b32_e32 v3, s1 1196; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1197; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1198; VI-NEXT: s_waitcnt vmcnt(0) 1199; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1200; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1201; VI-NEXT: flat_store_dword v[2:3], v0 1202; VI-NEXT: s_endpgm 1203; 1204; GFX9-LABEL: v_test_v2i16_x_sub_0_16: 1205; GFX9: ; %bb.0: 1206; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1207; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1208; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1209; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1210; GFX9-NEXT: s_waitcnt vmcnt(0) 1211; GFX9-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] 1212; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1213; GFX9-NEXT: s_endpgm 1214; 1215; GFX10-LABEL: v_test_v2i16_x_sub_0_16: 1216; GFX10: ; %bb.0: 1217; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1218; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1219; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1220; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 1221; GFX10-NEXT: s_waitcnt vmcnt(0) 1222; GFX10-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] 1223; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1224; GFX10-NEXT: s_endpgm 1225 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1226 %tid.ext = sext i32 %tid to i64 1227 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 1228 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 1229 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep 1230 %result = sub <2 x i16> %x, <i16 0, i16 16> 1231 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out 1232 ret void 1233} 1234 1235define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { 1236; SI-LABEL: v_test_v2i16_x_sub_0_1_0: 1237; SI: ; %bb.0: 1238; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 1239; SI-NEXT: s_mov_b32 s7, 0xf000 1240; SI-NEXT: s_mov_b32 s6, 0 1241; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1242; SI-NEXT: v_mov_b32_e32 v1, 0 1243; SI-NEXT: s_waitcnt lgkmcnt(0) 1244; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1245; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1246; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1247; SI-NEXT: s_waitcnt vmcnt(0) 1248; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3c000000, v2 1249; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1250; SI-NEXT: s_endpgm 1251; 1252; VI-LABEL: v_test_v2i16_x_sub_0_1_0: 1253; VI: ; %bb.0: 1254; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1255; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1256; VI-NEXT: s_waitcnt lgkmcnt(0) 1257; VI-NEXT: v_mov_b32_e32 v1, s3 1258; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1259; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1260; VI-NEXT: flat_load_dword v0, v[0:1] 1261; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 1262; VI-NEXT: v_mov_b32_e32 v3, s1 1263; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1264; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1265; VI-NEXT: s_waitcnt vmcnt(0) 1266; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1267; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1268; VI-NEXT: flat_store_dword v[2:3], v0 1269; VI-NEXT: s_endpgm 1270; 1271; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0: 1272; GFX9: ; %bb.0: 1273; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1274; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1275; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1276; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1277; GFX9-NEXT: s_brev_b32 s2, 35 1278; GFX9-NEXT: s_waitcnt vmcnt(0) 1279; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2 1280; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1281; GFX9-NEXT: s_endpgm 1282; 1283; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0: 1284; GFX10: ; %bb.0: 1285; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1286; GFX10-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 1287; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1288; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 1289; GFX10-NEXT: s_waitcnt vmcnt(0) 1290; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc400 op_sel:[0,1] op_sel_hi:[1,0] 1291; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1292; GFX10-NEXT: s_endpgm 1293 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1294 %tid.ext = sext i32 %tid to i64 1295 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 1296 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 1297 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep 1298 %result = sub <2 x i16> %x, <i16 0, i16 -15360> 1299 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out 1300 ret void 1301} 1302 1303define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { 1304; SI-LABEL: v_test_v2i16_x_sub_0_neg1_0: 1305; SI: ; %bb.0: 1306; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1307; SI-NEXT: s_mov_b32 s7, 0xf000 1308; SI-NEXT: s_mov_b32 s6, 0 1309; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1310; SI-NEXT: v_mov_b32_e32 v1, 0 1311; SI-NEXT: s_waitcnt lgkmcnt(0) 1312; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1313; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1314; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1315; SI-NEXT: s_waitcnt vmcnt(0) 1316; SI-NEXT: v_add_i32_e32 v2, vcc, 0xbc000000, v2 1317; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1318; SI-NEXT: s_endpgm 1319; 1320; VI-LABEL: v_test_v2i16_x_sub_0_neg1_0: 1321; VI: ; %bb.0: 1322; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1323; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1324; VI-NEXT: s_waitcnt lgkmcnt(0) 1325; VI-NEXT: v_mov_b32_e32 v1, s3 1326; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1327; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1328; VI-NEXT: flat_load_dword v0, v[0:1] 1329; VI-NEXT: v_mov_b32_e32 v1, 0xffffbc00 1330; VI-NEXT: v_mov_b32_e32 v3, s1 1331; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1332; 
VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1333; VI-NEXT: s_waitcnt vmcnt(0) 1334; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1335; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1336; VI-NEXT: flat_store_dword v[2:3], v0 1337; VI-NEXT: s_endpgm 1338; 1339; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0: 1340; GFX9: ; %bb.0: 1341; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1342; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1343; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1344; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1345; GFX9-NEXT: s_brev_b32 s2, 34 1346; GFX9-NEXT: s_waitcnt vmcnt(0) 1347; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2 1348; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1349; GFX9-NEXT: s_endpgm 1350; 1351; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0: 1352; GFX10: ; %bb.0: 1353; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1354; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1355; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1356; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 1357; GFX10-NEXT: s_waitcnt vmcnt(0) 1358; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0] 1359; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1360; GFX10-NEXT: s_endpgm 1361 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1362 %tid.ext = sext i32 %tid to i64 1363 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 1364 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 1365 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep 1366 %result = sub <2 x i16> %x, <i16 0, i16 17408> 1367 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out 1368 ret void 1369} 1370 1371; -32 isn't an inline immediate, but 32 is 1372define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { 1373; SI-LABEL: v_test_v2i16_x_add_neg32_neg32: 1374; SI: ; %bb.0: 1375; SI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 1376; SI-NEXT: s_mov_b32 s7, 0xf000 1377; SI-NEXT: s_mov_b32 s6, 0 1378; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1379; SI-NEXT: v_mov_b32_e32 v1, 0 1380; SI-NEXT: s_waitcnt lgkmcnt(0) 1381; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1382; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1383; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1384; SI-NEXT: s_waitcnt vmcnt(0) 1385; SI-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2 1386; SI-NEXT: s_mov_b32 s4, 0xffff0000 1387; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 1388; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2 1389; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1390; SI-NEXT: s_endpgm 1391; 1392; VI-LABEL: v_test_v2i16_x_add_neg32_neg32: 1393; VI: ; %bb.0: 1394; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1395; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1396; VI-NEXT: v_mov_b32_e32 v4, 32 1397; VI-NEXT: s_waitcnt lgkmcnt(0) 1398; VI-NEXT: v_mov_b32_e32 v1, s3 1399; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1400; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1401; VI-NEXT: flat_load_dword v3, v[0:1] 1402; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1403; VI-NEXT: v_mov_b32_e32 v1, s1 1404; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1405; VI-NEXT: s_waitcnt vmcnt(0) 1406; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1407; VI-NEXT: v_subrev_u16_e32 v3, 32, v3 1408; VI-NEXT: v_or_b32_e32 v2, v3, v2 1409; VI-NEXT: flat_store_dword v[0:1], v2 1410; VI-NEXT: s_endpgm 1411; 1412; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: 1413; GFX9: ; %bb.0: 1414; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1415; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1416; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1417; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1418; GFX9-NEXT: s_waitcnt vmcnt(0) 1419; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] 1420; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1421; GFX9-NEXT: s_endpgm 1422; 1423; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32: 1424; GFX10: ; %bb.0: 1425; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 -32, i16 -32>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

; Loads the <2 x i16> at index workitem.id.x of %in, adds <i16 0, i16 -32>
; (only element 1 has a non-zero addend) and stores the sum to the same index
; of %out. The GFX9 and GFX10 check blocks expect a single v_pk_sub_u16 with
; the constant 32 and op_sel:[0,1] op_sel_hi:[1,0].
define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_0_neg32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_0_neg32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 32
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 0, i16 -32>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

; Same load/add/store shape with the addend <i16 -32, i16 0> (only element 0
; has a non-zero addend). The GFX9 and GFX10 check blocks expect
; v_pk_sub_u16 with the constant 32 and no op_sel modifiers.
define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg32_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 -32, i16 0>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

; 16 and -16 are both inline immediates
; Adds <i16 -16, i16 -16> to the loaded <2 x i16>. The VI check block keeps the
; add form with -16, while the GFX9 and GFX10 check blocks expect v_pk_sub_u16
; with the constant 16 and op_sel_hi:[1,0].
define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg16_neg16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg16_neg16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, -16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, -16, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 -16, i16 -16>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

; Adds <i16 0, i16 -16> to the loaded <2 x i16> (only element 1 has a non-zero
; addend). The GFX9 and GFX10 check blocks expect v_pk_sub_u16 with the
; constant 16 and op_sel:[0,1] op_sel_hi:[1,0].
define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_0_neg16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_0_neg16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, -16
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 0, i16 -16>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

; Adds <i16 -16, i16 0> to the loaded <2 x i16> (only element 0 has a non-zero
; addend). The GFX9 and GFX10 check blocks expect v_pk_sub_u16 with the
; constant 16 and no op_sel modifiers.
define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg16_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_add_u16_e32 v3, -16, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 -16, i16 0>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

; Adds <i16 -15360, i16 -15360> to the loaded <2 x i16>; the VI check block
; shows the same value as 0xc400. The GFX9 check block expects the negated
; constant 0x3c003c00 to be materialised in an SGPR and fed to v_pk_sub_u16,
; while the GFX10 check block expects the literal 0x3c00 with op_sel_hi:[1,0].
define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg_fpone:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 0xffffc400, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xc4000000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg_fpone:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_movk_i32 s2, 0xc400
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, s2, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0x3c003c00
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 -15360, i16 -15360>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

; Adds <i16 17408, i16 17408> to the loaded <2 x i16>; the SI and VI check
; blocks show the same value as 0x4400. The GFX9 check block expects the
; negated constant 0xbc00bc00 in an SGPR feeding v_pk_sub_u16, and the GFX10
; check block expects the literal 0xbc00 with op_sel_hi:[1,0].
define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg_negfpone:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4400, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44000000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg_negfpone:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_movk_i32 s2, 0x4400
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, s2, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0xbc00bc00
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 17408, i16 17408>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

; Adds <i16 16384, i16 16384> to the loaded <2 x i16>; the SI and VI check
; blocks show the same value as 0x4000. The GFX9 check block expects the
; negated constant 0xc000c000 in an SGPR feeding v_pk_sub_u16, and the GFX10
; check block expects the literal 0xc000 with op_sel_hi:[1,0].
; NOTE(review): v_test_v2i16_x_add_neg_fpone adds the negative value -15360,
; whereas this neg_fptwo test adds the positive value 16384 and the
; neg_negfptwo test adds -16384. The two fptwo constants look swapped relative
; to the fpone naming - confirm the intent before renaming or changing
; anything.
define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg_fptwo:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4000, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 2.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg_fptwo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_movk_i32 s2, 0x4000
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, s2, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0xc000c000
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 16384, i16 16384>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

; Adds <i16 -16384, i16 -16384> to the loaded <2 x i16>; the VI check block
; shows the same value as 0xc000. The GFX9 check block expects the negated
; constant 0x40004000 in an SGPR feeding v_pk_sub_u16, and the GFX10 check
; block expects the literal 0x4000 with op_sel_hi:[1,0].
define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 0xffffc000, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, -2.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_movk_i32 s2, 0xc000
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, s2, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0x40004000
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 -16384, i16 -16384>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

; Adds <i16 undef, i16 -32> to the loaded <2 x i16> (element 0 of the addend
; is undef). The GFX9 and GFX10 check blocks expect the same v_pk_sub_u16 form
; as v_test_v2i16_x_add_0_neg32, i.e. the constant 32 with
; op_sel:[0,1] op_sel_hi:[1,0].
define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_undef_neg32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_undef_neg32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 32
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 undef, i16 -32>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

; Adds <i16 -32, i16 undef> to the loaded <2 x i16> (element 1 of the addend
; is undef). The GFX9 and GFX10 check blocks expect v_pk_sub_u16 with the
; constant 32 and no op_sel modifiers; the VI check block expects a single
; v_subrev_u16 with 32.
define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg32_undef:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg32_undef:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 -32, i16 undef>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

; Intrinsic called by each kernel above to obtain the index used for the
; %in / %out element addressing.
declare i32 @llvm.amdgcn.workitem.id.x() #1

; Attribute group #0 is attached to the kernels, #1 to the intrinsic
; declaration.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }