; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,SICIVI,SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,SICIVI,VI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,GFX9

; End-to-end codegen tests for the signed add-with-overflow intrinsics on
; AMDGPU (SI/tahiti, VI/tonga, GFX9), covering scalar i32/i64, vector
; <2 x i32>, uniform (SGPR) and divergent (VGPR-loaded) operands.

declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone


declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

; i64 saddo where the i1 carry is zero-extended to i64 and added back into the
; sum before the store; only the combined value is written out.
define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: saddo_i64_zext:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    s_add_u32 s10, s6, s8
; SI-NEXT:    s_addc_u32 s11, s7, s9
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
; SI-NEXT:    v_cmp_lt_i64_e64 s[6:7], s[8:9], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_xor_b64 s[4:5], s[6:7], vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v1, s11
; SI-NEXT:    v_add_i32_e32 v0, vcc, s10, v0
; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: saddo_i64_zext:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s6
; VI-NEXT:    s_add_u32 s2, s6, s0
; VI-NEXT:    s_addc_u32 s3, s7, s1
; VI-NEXT:    v_mov_b32_e32 v2, s7
; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    s_xor_b64 s[0:1], s[8:9], vcc
; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: saddo_i64_zext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    s_add_u32 s2, s6, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    s_addc_u32 s3, s7, s1
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_xor_b64 s[0:1], s[8:9], vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT:    s_endpgm
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  %ext = zext i1 %carry to i64
  %add2 = add i64 %val, %ext
  store i64 %add2, i64 addrspace(1)* %out, align 8
  ret void
}

; Scalar (uniform) i32 saddo: both operands come in as kernel arguments, and
; the value and the i1 carry are stored to separate buffers.
define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_saddo_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    v_cmp_lt_i32_e64 s[10:11], s9, 0
; SI-NEXT:    s_add_i32 s9, s8, s9
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, s9, v0
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_xor_b64 s[0:1], s[10:11], vcc
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_saddo_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_cmp_lt_i32_e64 s[2:3], s1, 0
; VI-NEXT:    s_add_i32 s1, s0, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_cmp_lt_i32_e32 vcc, s1, v4
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT:    flat_store_dword v[0:1], v4
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT:    flat_store_byte v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_saddo_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_add_i32 s1, s0, s1
; GFX9-NEXT:    v_add_i32 v1, s0, v1 clamp
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
; GFX9-NEXT:    s_endpgm
  %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
  %val = extractvalue { i32, i1 } %sadd, 0
  %carry = extractvalue { i32, i1 } %sadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; Vector-ALU (divergent) i32 saddo: operands are loaded from global memory,
; so the overflow check is lowered with VALU compares (and, on GFX9, a
; saturating v_add_i32 ... clamp compared against the wrapping add).
define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], v2, v0
; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT:    buffer_store_dword v2, off, s[8:11], 0
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_saddo_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dword v4, v[0:1]
; VI-NEXT:    flat_load_dword v5, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v6, vcc, v5, v4
; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v5
; VI-NEXT:    v_cmp_lt_i32_e64 s[0:1], v6, v4
; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    flat_store_dword v[0:1], v6
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT:    flat_store_byte v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_saddo_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_i32 v3, v1, v2 clamp
; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v3
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
  %a = load i32, i32 addrspace(1)* %aptr, align 4
  %b = load i32, i32 addrspace(1)* %bptr, align 4
  %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
  %val = extractvalue { i32, i1 } %sadd, 0
  %carry = extractvalue { i32, i1 } %sadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; Scalar (uniform) i64 saddo: the sum uses s_add_u32/s_addc_u32 and the
; overflow bit is derived from two 64-bit signed compares XORed together.
define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
; SI-LABEL: s_saddo_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_add_u32 s12, s4, s6
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    s_addc_u32 s13, s5, s7
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
; SI-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
; SI-NEXT:    v_mov_b32_e32 v0, s12
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_mov_b32_e32 v1, s13
; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; SI-NEXT:    s_mov_b32 s0, s2
; SI-NEXT:    s_mov_b32 s1, s3
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s2, s10
; SI-NEXT:    s_mov_b32 s3, s11
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_saddo_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    s_add_u32 s0, s4, s6
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_addc_u32 s1, s5, s7
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT:    flat_store_byte v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_saddo_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_u32 s8, s4, s6
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    s_addc_u32 s9, s5, s7
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, s8
; GFX9-NEXT:    v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s9
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; Vector-ALU (divergent) i64 saddo: operands loaded from global memory; the
; sum is a v_add/v_addc pair and overflow is two i64 compares XORed.
define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
; SI-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; SI-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_saddo_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v6, s2
; VI-NEXT:    v_mov_b32_e32 v7, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v9, vcc, v1, v3, vcc
; VI-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; VI-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT:    flat_store_byte v[6:7], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_saddo_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v6, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[8:9]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[4:5]
; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT:    global_store_byte v6, v0, s[6:7]
; GFX9-NEXT:    s_endpgm
  %a = load i64, i64 addrspace(1)* %aptr, align 4
  %b = load i64, i64 addrspace(1)* %bptr, align 4
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}

; Vector <2 x i32> saddo: the v2i32 intrinsic is scalarized into two lane-wise
; overflow checks; the <2 x i1> carry is zero-extended to <2 x i32> before the
; store (so the carry buffer receives a dwordx2 store, not bytes).
define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v5, vcc, v1, v3
; SI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v5, v1
; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; SI-NEXT:    v_cmp_lt_i32_e64 s[2:3], v4, v0
; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_saddo_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v6, s2
; VI-NEXT:    v_mov_b32_e32 v7, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v9, vcc, v1, v3
; VI-NEXT:    v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v9, v1
; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; VI-NEXT:    v_cmp_lt_i32_e64 s[2:3], v8, v0
; VI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_saddo_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_i32 v5, v0, v2 clamp
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
; GFX9-NEXT:    v_add_i32 v2, v1, v3 clamp
; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v5
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
; GFX9-NEXT:    s_endpgm
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
  %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
  %carry.ext = zext <2 x i1> %carry to <2 x i32>
  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
  ret void
}