; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s
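
; Summary of what the CHECK lines below expect: statically indexed extracts
; of byte vectors passed as kernel arguments stay on the scalar unit. The
; packed argument dword is read with a single s_load_dword, and non-zero
; elements are produced with a scalar right shift (a 16-bit VALU shift on VI
; for the <2 x i8> case); no flat, buffer, or global loads of the argument
; should appear.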

; GCN-LABEL: {{^}}extract_vector_elt_v1i8:
; GCN: s_load_dword [[LOAD:s[0-9]+]]
; GCN: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_byte [[V_LOAD]]
define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
  %p0 = extractelement <1 x i8> %foo, i32 0
  store i8 %p0, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v2i8:
; GCN: s_load_dword s
; GCN-NOT: {{flat|buffer|global}}
; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; VI: v_lshrrev_b16_e64 v{{[0-9]+}}, 8, s{{[0-9]+}}
; GCN-NOT: {{flat|buffer|global}}
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
  %p0 = extractelement <2 x i8> %foo, i32 0
  %p1 = extractelement <2 x i8> %foo, i32 1
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p1, i8 addrspace(1)* %out
  store volatile i8 %p0, i8 addrspace(1)* %out1
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v3i8:
; GCN: s_load_dword s
; GCN-NOT: {{flat|buffer|global}}
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN-NOT: {{flat|buffer|global}}
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
  %p0 = extractelement <3 x i8> %foo, i32 0
  %p1 = extractelement <3 x i8> %foo, i32 2
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p1, i8 addrspace(1)* %out
  store volatile i8 %p0, i8 addrspace(1)* %out1
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v4i8:
; GCN: s_load_dword s
; GCN-NOT: {{flat|buffer|global}}
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN-NOT: {{flat|buffer|global}}
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
  %p0 = extractelement <4 x i8> %foo, i32 0
  %p1 = extractelement <4 x i8> %foo, i32 2
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p1, i8 addrspace(1)* %out
  store volatile i8 %p0, i8 addrspace(1)* %out1
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v8i8:
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
  %p0 = extractelement <8 x i8> %foo, i32 0
  %p1 = extractelement <8 x i8> %foo, i32 2
  store volatile i8 %p1, i8 addrspace(1)* null
  store volatile i8 %p0, i8 addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v16i8:
; GCN: s_load_dword [[LOAD0:s[0-9]+]]
; GCN-NOT: {{flat|buffer|global}}
; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
; GCN: buffer_store_byte [[V_ELT2]]
; GCN: buffer_store_byte [[V_LOAD0]]
define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
  %p0 = extractelement <16 x i8> %foo, i32 0
  %p1 = extractelement <16 x i8> %foo, i32 2
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p1, i8 addrspace(1)* %out
  store volatile i8 %p0, i8 addrspace(1)* %out1
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v32i8:
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[VAL]], 16
; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
; GCN: buffer_store_byte [[V_ELT2]]
; GCN: buffer_store_byte [[V_LOAD0]]
define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
  %p0 = extractelement <32 x i8> %foo, i32 0
  %p1 = extractelement <32 x i8> %foo, i32 2
  store volatile i8 %p1, i8 addrspace(1)* null
  store volatile i8 %p0, i8 addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v64i8:
; GCN: s_load_dword [[LOAD0:s[0-9]+]]
; GCN-NOT: {{flat|buffer|global}}
; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
; GCN: buffer_store_byte [[V_ELT2]]
; GCN: buffer_store_byte [[V_LOAD0]]
define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
  %p0 = extractelement <64 x i8> %foo, i32 0
  %p1 = extractelement <64 x i8> %foo, i32 2
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p1, i8 addrspace(1)* %out
  store volatile i8 %p0, i8 addrspace(1)* %out1
  ret void
}

; FIXME: SI generates much worse code for this, which is a pain to match

; FIXME: 16-bit and 32-bit shifts are not combined after legalization due to
; isTypeDesirableForOp in SimplifyDemandedBits
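
; The dynamic-index cases below expect the index to be converted into a bit
; offset (s_lshl_b32 by 3, i.e. idx * 8 bits per byte element) and the packed
; vector to be shifted right by that amount, so the wanted byte lands in the
; low bits of the result that gets stored.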

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c
; VI-NOT: {{flat|buffer|global}}
; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: v_lshrrev_b16_e32 [[ELT:v[0-9]+]], [[SCALED_IDX]], [[V_LOAD]]
; VI: buffer_store_byte [[ELT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
  %elt = extractelement <2 x i8> %foo, i32 %idx
  store volatile i8 %elt, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c
; VI-NOT: {{flat|buffer|global}}
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]]
; VI: v_mov_b32_e32 [[V_ELT:v[0-9]+]], [[ELT]]
; VI: buffer_store_byte [[V_ELT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
  %p0 = extractelement <3 x i8> %foo, i32 %idx
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p0, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x30
; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0

; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC4]], [[SCALED_IDX]]

; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
; VI: buffer_store_byte [[V_EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
  %vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr
  %p0 = extractelement <4 x i8> %vec, i32 %idx
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p0, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8:
; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x10
; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0

; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC8]], [[SCALED_IDX]]
; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], s[[EXTRACT_LO]]
; VI: buffer_store_byte [[V_EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 {
  %vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr
  %p0 = extractelement <8 x i8> %vec, i32 %idx
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p0, i8 addrspace(1)* %out
  ret void
}
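
; The reduce_load cases expect the wide vector load to be shrunk to just the
; dwords that feed the extracted elements: a single dword for bytes 0-3, two
; dwords when bytes 0, 1, 4, and 5 are used, and a single dword at byte
; offset 4 when only bytes 4 and 5 are used.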

; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0123:
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dword s
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 24
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
  %elt0 = extractelement <8 x i8> %load, i32 0
  %elt1 = extractelement <8 x i8> %load, i32 1
  %elt2 = extractelement <8 x i8> %load, i32 2
  %elt3 = extractelement <8 x i8> %load, i32 3
  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt2, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt3, i8 addrspace(1)* undef, align 1
  ret void
}

; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0145:
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dwordx2
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
  %elt0 = extractelement <8 x i8> %load, i32 0
  %elt1 = extractelement <8 x i8> %load, i32 1
  %elt4 = extractelement <8 x i8> %load, i32 4
  %elt5 = extractelement <8 x i8> %load, i32 5
  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
  ret void
}

; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_45:
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_mov_b64 [[PTR:s\[[0-9]+:[0-9]+\]]], 4{{$}}
; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0{{$}}
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
  %elt4 = extractelement <8 x i8> %load, i32 4
  %elt5 = extractelement <8 x i8> %load, i32 5
  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
  ret void
}

; FIXME: ought to be able to eliminate high half of load
; GCN-LABEL: {{^}}reduce_load_vector_v16i8_extract_0145:
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dwordx4
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(4)* null
  %elt0 = extractelement <16 x i8> %load, i32 0
  %elt1 = extractelement <16 x i8> %load, i32 1
  %elt4 = extractelement <16 x i8> %load, i32 4
  %elt5 = extractelement <16 x i8> %load, i32 5
  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
  ret void
}

attributes #0 = { nounwind }