; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s

; Checks how fixed-size (32-byte) llvm.memcpy calls are expanded for the
; amdgcn target at different alignments and between different address spaces.
; The overload suffix of each intrinsic name mirrors the pointer address
; spaces and the length type used in its signature.

declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i1) nounwind
declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind
declare void @llvm.memcpy.p1i8.p4i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(4)* nocapture, i64, i1) nounwind


; addrspace(3) copy with no alignment attribute on the pointers: expect the
; 32 bytes to be moved with 32 byte-sized reads and 32 byte-sized writes.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8

; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8

; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8

; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8

; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8

; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8

; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8

; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8

; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i1 false) nounwind
  ret void
}

; addrspace(3) copy with both pointers marked align 2: expect 16 16-bit reads
; and 16 16-bit writes.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16

; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16

; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16

; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16

; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 2 %bcout, i8 addrspace(3)* align 2 %bcin, i32 32, i1 false) nounwind
  ret void
}

; addrspace(3) copy with both pointers marked align 4: expect four paired
; 32-bit reads followed by four paired 32-bit writes.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4:
; SI: ds_read2_b32
; SI: ds_read2_b32
; SI: ds_read2_b32
; SI: ds_read2_b32

; SI: ds_write2_b32
; SI: ds_write2_b32
; SI: ds_write2_b32
; SI: ds_write2_b32

; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 4 %bcout, i8 addrspace(3)* align 4 %bcin, i32 32, i1 false) nounwind
  ret void
}

; addrspace(3) copy with both pointers marked align 8: expect two paired
; 64-bit reads followed by two paired 64-bit writes.
; NOTE(review): this test used to carry "FIXME: Use 64-bit ops"; the checks
; below already expect the b64 forms, so that FIXME looks resolved - confirm.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:

; SI: ds_read2_b64
; SI: ds_read2_b64

; SI: ds_write2_b64
; SI: ds_write2_b64

; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 8 %bcout, i8 addrspace(3)* align 8 %bcin, i32 32, i1 false) nounwind
  ret void
}

; addrspace(1) copy with no alignment attribute on the pointers: expect 32
; byte loads and 32 byte stores, in any interleaving.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1:
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i1 false) nounwind
  ret void
}

; addrspace(1) copy with both pointers marked align 2: expect 16 short loads
; and 16 short stores.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2:
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort

; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short

; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 2 %bcout, i8 addrspace(1)* align 2 %bcin, i64 32, i1 false) nounwind
  ret void
}

; addrspace(1) copy with both pointers marked align 4: expect two dwordx4
; loads followed by two dwordx4 stores.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4:
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %bcout, i8 addrspace(1)* align 4 %bcin, i64 32, i1 false) nounwind
  ret void
}

; Same expectations as the align 4 global case, with both pointers marked
; align 8.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8:
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 8 %bcout, i8 addrspace(1)* align 8 %bcin, i64 32, i1 false) nounwind
  ret void
}

; Same expectations as the align 4 global case, with both pointers marked
; align 16.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16:
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 16 %bcout, i8 addrspace(1)* align 16 %bcin, i64 32, i1 false) nounwind
  ret void
}

; Test shouldConvertConstantLoadToIntImm
; NOTE(review): both kernels below copy 32 bytes out of a 16-byte constant
; array, and the align1 variant only pins 16 byte stores - confirm that the
; over-long copy length is intentional.
@hello.align4 = private unnamed_addr addrspace(4) constant [16 x i8] c"constant string\00", align 4
@hello.align1 = private unnamed_addr addrspace(4) constant [16 x i8] c"constant string\00", align 1

; Source is the align 4 constant: the checks expect its address to be formed
; pc-relatively, the data fetched with scalar loads, and written out with two
; dwordx4 stores.
; FUNC-LABEL: {{^}}test_memcpy_const_string_align4:
; SI: s_getpc_b64
; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4@rel32@lo+20
; SI: s_addc_u32
; SI-DAG: s_load_dwordx4
; SI-DAG: s_load_dwordx4
; SI-DAG: s_load_dwordx2
; SI-DAG: buffer_store_dwordx4
; SI-DAG: buffer_store_dwordx4
define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind {
  %str = bitcast [16 x i8] addrspace(4)* @hello.align4 to i8 addrspace(4)*
  call void @llvm.memcpy.p1i8.p4i8.i64(i8 addrspace(1)* align 4 %out, i8 addrspace(4)* align 4 %str, i64 32, i1 false)
  ret void
}

; Source is the align 1 constant: the checks expect no buffer loads, an
; immediate (hex) value moved into a VGPR, and byte-sized stores.
; FUNC-LABEL: {{^}}test_memcpy_const_string_align1:
; SI-NOT: buffer_load
; SI: v_mov_b32_e32 v{{[0-9]+}}, 0x
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
define amdgpu_kernel void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind {
  %str = bitcast [16 x i8] addrspace(4)* @hello.align1 to i8 addrspace(4)*
  call void @llvm.memcpy.p1i8.p4i8.i64(i8 addrspace(1)* %out, i8 addrspace(4)* %str, i64 32, i1 false)
  ret void
}