; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s

; SI-LABEL: {{^}}unaligned_load_store_i16_local:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind {
  %v = load i16, i16 addrspace(3)* %p, align 1
  store i16 %v, i16 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}unaligned_load_store_i16_global:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: s_endpgm
define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind {
  %v = load i16, i16 addrspace(1)* %p, align 1
  store i16 %v, i16 addrspace(1)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}unaligned_load_store_i32_local:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
  %v = load i32, i32 addrspace(3)* %p, align 1
  store i32 %v, i32 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}unaligned_load_store_i32_global:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind {
  %v = load i32, i32 addrspace(1)* %p, align 1
  store i32 %v, i32 addrspace(1)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}unaligned_load_store_i64_local:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
  %v = load i64, i64 addrspace(3)* %p, align 1
  store i64 %v, i64 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}unaligned_load_store_i64_global:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
  %v = load i64, i64 addrspace(1)* %p, align 1
  store i64 %v, i64 addrspace(1)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}unaligned_load_store_v4i32_local:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
  %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
  ret void
}

; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded.
; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global:
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
  %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}load_lds_i64_align_4:
; SI: ds_read2_b32
; SI: s_endpgm
define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %val = load i64, i64 addrspace(3)* %in, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset:
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
; SI: s_endpgm
define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
  %val = load i64, i64 addrspace(3)* %ptr, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset:
; Tests the case where the lo offset fits in 8 bits but the hi offset needs 9 bits,
; so the offset must be folded into the base pointer.
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
; SI: s_endpgm
define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  %val = load i64, i64 addrspace(3)* %ptri64, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; SI-LABEL: {{^}}load_lds_i64_align_1:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: buffer_store_dwordx2
; SI: s_endpgm

define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %val = load i64, i64 addrspace(3)* %in, align 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; SI-LABEL: {{^}}store_lds_i64_align_4:
; SI: ds_write2_b32
; SI: s_endpgm
define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
  store i64 %val, i64 addrspace(3)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset:
; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
; SI: s_endpgm
define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
  store i64 0, i64 addrspace(3)* %ptr, align 4
  ret void
}

; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset:
; Tests the case where the lo offset fits in 8 bits but the hi offset needs 9 bits,
; so the offset must be folded into the base pointer.
; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; SI: s_endpgm
define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
  %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  store i64 0, i64 addrspace(3)* %ptri64, align 4
  ret void
}

attributes #0 = { nounwind }