; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_shl_i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 64, v4
; GCN-NEXT:    v_lshl_b64 v[5:6], v[2:3], v4
; GCN-NEXT:    v_lshr_b64 v[7:8], v[0:1], v7
; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
; GCN-NEXT:    v_or_b32_e32 v7, v5, v7
; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, 64, v4
; GCN-NEXT:    v_or_b32_e32 v8, v6, v8
; GCN-NEXT:    v_lshl_b64 v[5:6], v[0:1], v5
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl i128 %lhs, %rhs
  ret i128 %shl
}

define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_lshr_i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 64, v4
; GCN-NEXT:    v_lshr_b64 v[5:6], v[0:1], v4
; GCN-NEXT:    v_lshl_b64 v[7:8], v[2:3], v7
; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
; GCN-NEXT:    v_or_b32_e32 v7, v5, v7
; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, 64, v4
; GCN-NEXT:    v_or_b32_e32 v8, v6, v8
; GCN-NEXT:    v_lshr_b64 v[5:6], v[2:3], v5
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
; GCN-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v0, v5, v0, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]

  %shl = lshr i128 %lhs, %rhs
  ret i128 %shl
}

define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_ashr_i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 64, v4
; GCN-NEXT:    v_lshr_b64 v[5:6], v[0:1], v4
; GCN-NEXT:    v_lshl_b64 v[7:8], v[2:3], v7
; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
; GCN-NEXT:    v_or_b32_e32 v7, v5, v7
; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, 64, v4
; GCN-NEXT:    v_or_b32_e32 v8, v6, v8
; GCN-NEXT:    v_ashr_i64 v[5:6], v[2:3], v5
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v0, v5, v0, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
; GCN-NEXT:    v_ashr_i64 v[4:5], v[2:3], v4
; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = ashr i128 %lhs, %rhs
  ret i128 %shl
}

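; Test shifts by constant operands. The *_vk cases below shift a variable by a
; constant amount and select to v_alignbit_b32 based sequences; the *_kv cases
; shift a constant value by a variable amount.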
define i128 @v_shl_i128_vk(i128 %lhs) {
; GCN-LABEL: v_shl_i128_vk:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_alignbit_b32 v4, v2, v1, 15
; GCN-NEXT:    v_alignbit_b32 v1, v1, v0, 15
; GCN-NEXT:    v_alignbit_b32 v3, v3, v2, 15
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
; GCN-NEXT:    v_mov_b32_e32 v2, v4
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl i128 %lhs, 17
  ret i128 %shl
}

define i128 @v_lshr_i128_vk(i128 %lhs) {
; GCN-LABEL: v_lshr_i128_vk:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_alignbit_b32 v0, v3, v2, 1
; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = lshr i128 %lhs, 65
  ret i128 %shl
}

define i128 @v_ashr_i128_vk(i128 %lhs) {
; GCN-LABEL: v_ashr_i128_vk:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_ashr_i64 v[4:5], v[2:3], 33
; GCN-NEXT:    v_alignbit_b32 v0, v2, v1, 1
; GCN-NEXT:    v_alignbit_b32 v1, v3, v2, 1
; GCN-NEXT:    v_mov_b32_e32 v2, v4
; GCN-NEXT:    v_mov_b32_e32 v3, v5
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = ashr i128 %lhs, 33
  ret i128 %shl
}

define i128 @v_shl_i128_kv(i128 %rhs) {
; GCN-LABEL: v_shl_i128_kv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v1, vcc, 64, v0
; GCN-NEXT:    v_lshr_b64 v[2:3], 17, v1
; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, 64, v0
; GCN-NEXT:    v_lshl_b64 v[4:5], 17, v1
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, v1, s[4:5]
; GCN-NEXT:    v_lshl_b64 v[0:1], 17, v0
; GCN-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl i128 17, %rhs
  ret i128 %shl
}

define i128 @v_lshr_i128_kv(i128 %rhs) {
; GCN-LABEL: v_lshr_i128_kv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_movk_i32 s4, 0x41
; GCN-NEXT:    s_mov_b32 s5, 0
; GCN-NEXT:    v_lshr_b64 v[1:2], s[4:5], v0
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT:    v_mov_b32_e32 v3, s4
; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT:    s_and_b64 vcc, s[4:5], vcc
; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v1, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = lshr i128 65, %rhs
  ret i128 %shl
}

define i128 @v_ashr_i128_kv(i128 %rhs) {
; GCN-LABEL: v_ashr_i128_kv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshr_b64 v[1:2], 33, v0
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT:    s_and_b64 vcc, s[4:5], vcc
; GCN-NEXT:    v_cndmask_b32_e64 v0, 33, v1, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = ashr i128 33, %rhs
  ret i128 %shl
}

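; Test i128 shifts with uniform (SGPR) operands in kernels. Storing the result
; to null keeps the computation live.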
define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_shl_i128_ss:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v5, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_sub_i32 s9, 64, s8
; GCN-NEXT:    s_sub_i32 s2, s8, 64
; GCN-NEXT:    s_lshl_b64 s[0:1], s[6:7], s8
; GCN-NEXT:    s_lshr_b64 s[10:11], s[4:5], s9
; GCN-NEXT:    s_or_b64 s[10:11], s[0:1], s[10:11]
; GCN-NEXT:    s_lshl_b64 s[2:3], s[4:5], s2
; GCN-NEXT:    v_mov_b32_e32 v0, s3
; GCN-NEXT:    v_mov_b32_e32 v1, s11
; GCN-NEXT:    v_cmp_lt_u32_e64 vcc, s8, 64
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_mov_b32_e32 v1, s7
; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s8, 0
; GCN-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s10
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
; GCN-NEXT:    s_lshl_b64 s[0:1], s[4:5], s8
; GCN-NEXT:    v_mov_b32_e32 v0, s1
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = shl i128 %lhs, %rhs
  store i128 %shift, i128 addrspace(1)* null
  ret void
}

define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_lshr_i128_ss:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v5, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_sub_i32 s9, 64, s8
; GCN-NEXT:    s_sub_i32 s2, s8, 64
; GCN-NEXT:    s_lshr_b64 s[0:1], s[4:5], s8
; GCN-NEXT:    s_lshl_b64 s[10:11], s[6:7], s9
; GCN-NEXT:    s_or_b64 s[10:11], s[0:1], s[10:11]
; GCN-NEXT:    s_lshr_b64 s[2:3], s[6:7], s2
; GCN-NEXT:    v_mov_b32_e32 v0, s3
; GCN-NEXT:    v_mov_b32_e32 v1, s11
; GCN-NEXT:    v_cmp_lt_u32_e64 vcc, s8, 64
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_mov_b32_e32 v1, s5
; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s8, 0
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v2, s10
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT:    v_mov_b32_e32 v2, s4
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    s_lshr_b64 s[0:1], s[6:7], s8
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
; GCN-NEXT:    v_mov_b32_e32 v2, s0
; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = lshr i128 %lhs, %rhs
  store i128 %shift, i128 addrspace(1)* null
  ret void
}

define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_ashr_i128_ss:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_ashr_i64 s[0:1], s[6:7], s8
; GCN-NEXT:    s_ashr_i32 s2, s7, 31
; GCN-NEXT:    v_mov_b32_e32 v2, s0
; GCN-NEXT:    s_sub_i32 s0, s8, 64
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    s_ashr_i64 s[2:3], s[6:7], s0
; GCN-NEXT:    s_sub_i32 s0, 64, s8
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_lshl_b64 s[0:1], s[6:7], s0
; GCN-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
; GCN-NEXT:    v_cmp_lt_u32_e64 vcc, s8, 64
; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[0:1]
; GCN-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
; GCN-NEXT:    v_mov_b32_e32 v0, s3
; GCN-NEXT:    v_mov_b32_e32 v1, s7
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_mov_b32_e32 v1, s5
; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s8, 0
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v4, s6
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; GCN-NEXT:    v_mov_b32_e32 v4, s4
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v5, 0
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = ashr i128 %lhs, %rhs
  store i128 %shift, i128 addrspace(1)* null
  ret void
}

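; Test that the expansion also handles <2 x i128> with divergent (VGPR)
; operands; each i128 element is expanded independently.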
define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: v_shl_v2i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
; GCN-NEXT:    v_lshr_b64 v[16:17], v[0:1], v16
; GCN-NEXT:    v_lshl_b64 v[18:19], v[2:3], v8
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v8
; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
; GCN-NEXT:    v_lshl_b64 v[16:17], v[0:1], v9
; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GCN-NEXT:    v_cndmask_b32_e64 v9, v16, v18, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
; GCN-NEXT:    v_sub_i32_e64 v9, s[6:7], 64, v12
; GCN-NEXT:    v_cndmask_b32_e64 v11, v17, v19, s[4:5]
; GCN-NEXT:    v_lshr_b64 v[9:10], v[4:5], v9
; GCN-NEXT:    v_lshl_b64 v[16:17], v[6:7], v12
; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
; GCN-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
; GCN-NEXT:    v_or_b32_e32 v16, v16, v9
; GCN-NEXT:    v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v12
; GCN-NEXT:    v_or_b32_e32 v11, v17, v10
; GCN-NEXT:    v_lshl_b64 v[9:10], v[4:5], v9
; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
; GCN-NEXT:    s_and_b64 vcc, s[8:9], s[6:7]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v12
; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v6, v9, v6, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v9, v10, v11, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl <2 x i128> %lhs, %rhs
  ret <2 x i128> %shl
}

define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: v_lshr_v2i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v16
; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v8
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v8
; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
; GCN-NEXT:    v_lshr_b64 v[16:17], v[2:3], v9
; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GCN-NEXT:    v_cndmask_b32_e64 v9, v16, v18, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
; GCN-NEXT:    v_sub_i32_e64 v9, s[6:7], 64, v12
; GCN-NEXT:    v_cndmask_b32_e64 v11, v17, v19, s[4:5]
; GCN-NEXT:    v_lshl_b64 v[9:10], v[6:7], v9
; GCN-NEXT:    v_lshr_b64 v[16:17], v[4:5], v12
; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
; GCN-NEXT:    v_or_b32_e32 v16, v16, v9
; GCN-NEXT:    v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v12
; GCN-NEXT:    v_or_b32_e32 v11, v17, v10
; GCN-NEXT:    v_lshr_b64 v[9:10], v[6:7], v9
; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
; GCN-NEXT:    s_and_b64 vcc, s[8:9], s[6:7]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
; GCN-NEXT:    v_lshr_b64 v[2:3], v[2:3], v8
; GCN-NEXT:    v_lshr_b64 v[6:7], v[6:7], v12
; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v4, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v9, v10, v11, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v7, 0, v7, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = lshr <2 x i128> %lhs, %rhs
  ret <2 x i128> %shl
}

define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: v_ashr_v2i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v16
; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v8
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v8
; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
; GCN-NEXT:    v_ashr_i64 v[16:17], v[2:3], v9
; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GCN-NEXT:    v_cndmask_b32_e64 v9, v16, v18, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
; GCN-NEXT:    v_sub_i32_e64 v9, s[6:7], 64, v12
; GCN-NEXT:    v_cndmask_b32_e64 v11, v17, v19, s[4:5]
; GCN-NEXT:    v_lshl_b64 v[9:10], v[6:7], v9
; GCN-NEXT:    v_lshr_b64 v[16:17], v[4:5], v12
; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
; GCN-NEXT:    v_or_b32_e32 v16, v16, v9
; GCN-NEXT:    v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v12
; GCN-NEXT:    v_or_b32_e32 v11, v17, v10
; GCN-NEXT:    v_ashr_i64 v[9:10], v[6:7], v9
; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
; GCN-NEXT:    s_and_b64 vcc, s[8:9], s[6:7]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v4, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v9, v10, v11, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
; GCN-NEXT:    v_ashr_i64 v[8:9], v[2:3], v8
; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v8, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[4:5]
; GCN-NEXT:    v_ashr_i64 v[8:9], v[6:7], v12
; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
; GCN-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = ashr <2 x i128> %lhs, %rhs
  ret <2 x i128> %shl
}

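; Test <2 x i128> shifts with uniform (SGPR) operands; the two element results
; are stored as separate dwordx4 halves at offsets 0 and 16.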
define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_shl_v2i128ss:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x8
; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
; GCN-NEXT:    v_mov_b32_e32 v10, 16
; GCN-NEXT:    v_mov_b32_e32 v8, 0
; GCN-NEXT:    v_mov_b32_e32 v11, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_sub_i32 s6, 64, s16
; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
; GCN-NEXT:    s_sub_i32 s4, s16, 64
; GCN-NEXT:    s_lshr_b64 s[6:7], s[8:9], s6
; GCN-NEXT:    s_lshl_b64 s[24:25], s[10:11], s16
; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
; GCN-NEXT:    s_or_b64 s[0:1], s[16:17], s[18:19]
; GCN-NEXT:    s_lshl_b64 s[4:5], s[8:9], s4
; GCN-NEXT:    s_or_b64 s[6:7], s[24:25], s[6:7]
; GCN-NEXT:    v_mov_b32_e32 v0, s5
; GCN-NEXT:    v_mov_b32_e32 v1, s7
; GCN-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_mov_b32_e32 v1, s11
; GCN-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_mov_b32_e32 v1, s10
; GCN-NEXT:    s_sub_i32 s6, 64, s20
; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[20:21], 64
; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[22:23], 0
; GCN-NEXT:    s_sub_i32 s4, s20, 64
; GCN-NEXT:    s_lshr_b64 s[6:7], s[12:13], s6
; GCN-NEXT:    s_lshl_b64 s[10:11], s[14:15], s20
; GCN-NEXT:    s_lshl_b64 s[4:5], s[12:13], s4
; GCN-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_or_b64 s[2:3], s[20:21], s[22:23]
; GCN-NEXT:    v_mov_b32_e32 v0, s5
; GCN-NEXT:    v_mov_b32_e32 v1, s7
; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], 0
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v1, s15
; GCN-NEXT:    v_cndmask_b32_e64 v7, v0, v1, s[2:3]
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v1, s14
; GCN-NEXT:    v_cndmask_b32_e64 v6, v0, v1, s[2:3]
; GCN-NEXT:    s_lshl_b64 s[2:3], s[8:9], s16
; GCN-NEXT:    v_mov_b32_e32 v0, s3
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    s_lshl_b64 s[2:3], s[12:13], s20
; GCN-NEXT:    v_mov_b32_e32 v4, s3
; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v4, s2
; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v9, 0
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = shl <2 x i128> %lhs, %rhs
  store <2 x i128> %shift, <2 x i128> addrspace(1)* null
  ret void
}

define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_lshr_v2i128_ss:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x8
; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
; GCN-NEXT:    v_mov_b32_e32 v10, 16
; GCN-NEXT:    v_mov_b32_e32 v8, 0
; GCN-NEXT:    v_mov_b32_e32 v11, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_sub_i32 s6, 64, s16
; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
; GCN-NEXT:    s_sub_i32 s4, s16, 64
; GCN-NEXT:    s_lshl_b64 s[6:7], s[10:11], s6
; GCN-NEXT:    s_lshr_b64 s[24:25], s[8:9], s16
; GCN-NEXT:    s_or_b64 s[6:7], s[24:25], s[6:7]
; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
; GCN-NEXT:    s_or_b64 s[0:1], s[16:17], s[18:19]
; GCN-NEXT:    s_lshr_b64 s[4:5], s[10:11], s4
; GCN-NEXT:    v_mov_b32_e32 v0, s5
; GCN-NEXT:    v_mov_b32_e32 v1, s7
; GCN-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_mov_b32_e32 v1, s9
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    v_mov_b32_e32 v2, s6
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT:    v_mov_b32_e32 v2, s8
; GCN-NEXT:    s_sub_i32 s6, 64, s20
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[20:21], 64
; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[22:23], 0
; GCN-NEXT:    s_sub_i32 s4, s20, 64
; GCN-NEXT:    s_lshl_b64 s[6:7], s[14:15], s6
; GCN-NEXT:    s_lshr_b64 s[8:9], s[12:13], s20
; GCN-NEXT:    s_lshr_b64 s[4:5], s[14:15], s4
; GCN-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_or_b64 s[2:3], s[20:21], s[22:23]
; GCN-NEXT:    v_mov_b32_e32 v2, s5
; GCN-NEXT:    v_mov_b32_e32 v3, s7
; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], 0
; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v3, s13
; GCN-NEXT:    v_cndmask_b32_e64 v5, v2, v3, s[2:3]
; GCN-NEXT:    v_mov_b32_e32 v2, s4
; GCN-NEXT:    v_mov_b32_e32 v3, s6
; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v3, s12
; GCN-NEXT:    v_cndmask_b32_e64 v4, v2, v3, s[2:3]
; GCN-NEXT:    s_lshr_b64 s[2:3], s[10:11], s16
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_lshr_b64 s[2:3], s[14:15], s20
; GCN-NEXT:    v_mov_b32_e32 v6, s3
; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, v6, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v6, s2
; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v9, 0
; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = lshr <2 x i128> %lhs, %rhs
  store <2 x i128> %shift, <2 x i128> addrspace(1)* null
  ret void
}

define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_ashr_v2i128_ss:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x8
; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
; GCN-NEXT:    v_mov_b32_e32 v8, 0
; GCN-NEXT:    v_mov_b32_e32 v9, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_sub_i32 s6, 64, s16
; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
; GCN-NEXT:    s_sub_i32 s4, s16, 64
; GCN-NEXT:    s_lshl_b64 s[6:7], s[10:11], s6
; GCN-NEXT:    s_lshr_b64 s[24:25], s[8:9], s16
; GCN-NEXT:    s_or_b64 s[6:7], s[24:25], s[6:7]
; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
; GCN-NEXT:    s_or_b64 s[0:1], s[16:17], s[18:19]
; GCN-NEXT:    s_ashr_i64 s[4:5], s[10:11], s4
; GCN-NEXT:    v_mov_b32_e32 v0, s5
; GCN-NEXT:    v_mov_b32_e32 v1, s7
; GCN-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_mov_b32_e32 v1, s9
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    v_mov_b32_e32 v2, s6
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT:    v_mov_b32_e32 v2, s8
; GCN-NEXT:    s_sub_i32 s6, 64, s20
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[20:21], 64
; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[22:23], 0
; GCN-NEXT:    s_sub_i32 s4, s20, 64
; GCN-NEXT:    s_lshl_b64 s[6:7], s[14:15], s6
; GCN-NEXT:    s_lshr_b64 s[8:9], s[12:13], s20
; GCN-NEXT:    s_ashr_i64 s[4:5], s[14:15], s4
; GCN-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_or_b64 s[2:3], s[20:21], s[22:23]
; GCN-NEXT:    v_mov_b32_e32 v2, s5
; GCN-NEXT:    v_mov_b32_e32 v3, s7
; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], 0
; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v3, s13
; GCN-NEXT:    v_cndmask_b32_e64 v5, v2, v3, s[2:3]
; GCN-NEXT:    v_mov_b32_e32 v2, s4
; GCN-NEXT:    v_mov_b32_e32 v3, s6
; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v3, s12
; GCN-NEXT:    v_cndmask_b32_e64 v4, v2, v3, s[2:3]
; GCN-NEXT:    s_ashr_i64 s[2:3], s[10:11], s16
; GCN-NEXT:    s_ashr_i32 s4, s11, 31
; GCN-NEXT:    v_mov_b32_e32 v2, s4
; GCN-NEXT:    v_mov_b32_e32 v3, s3
; GCN-NEXT:    v_mov_b32_e32 v6, s2
; GCN-NEXT:    s_ashr_i64 s[2:3], s[14:15], s20
; GCN-NEXT:    s_ashr_i32 s4, s15, 31
; GCN-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
; GCN-NEXT:    v_mov_b32_e32 v6, s4
; GCN-NEXT:    v_mov_b32_e32 v7, s3
; GCN-NEXT:    v_mov_b32_e32 v10, s2
; GCN-NEXT:    v_cndmask_b32_e64 v7, v6, v7, s[0:1]
; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v10, 16
; GCN-NEXT:    v_mov_b32_e32 v11, 0
; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = ashr <2 x i128> %lhs, %rhs
  store <2 x i128> %shift, <2 x i128> addrspace(1)* null
  ret void
}