; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefixes=OPT,ALL %s

; The MAX1024 run leaves fixed-size mem intrinsics of up to 1024 bytes alone;
; the ALL run (threshold -1) expands everything.

declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #1

declare void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i1) #1
declare void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p5i8.p5i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture readonly, i32, i1) #1

declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i1) #1

; Test the upper bound for sizes to leave unexpanded.
define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @max_size_small_static_memcpy_caller0(
; MAX1024-NEXT:    call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 addrspace(1)* [[SRC:%.*]], i64 1024, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @max_size_small_static_memcpy_caller0(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; ALL:       load-store-loop:
; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 1
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 1
; ALL-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; ALL-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL:       memcpy-split:
; ALL-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i1 false)
  ret void
}

; Smallest static size which will be expanded.
define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @min_size_large_static_memcpy_caller0(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 1
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 1
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1024
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1024
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 1
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i1 false)
  ret void
}

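; Static memmove expansion cannot assume the ranges are disjoint: the lowering
; compares src and dst and selects between a byte-wise backwards copy loop and
; a byte-wise forwards copy loop, as the checks below show.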
define amdgpu_kernel void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @max_size_small_static_memmove_caller0(
; MAX1024-NEXT:    call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 addrspace(1)* [[SRC:%.*]], i64 1024, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @max_size_small_static_memmove_caller0(
; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult i8 addrspace(1)* [[SRC:%.*]], [[DST:%.*]]
; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1024, 0
; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
; ALL:       copy_backwards:
; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
; ALL:       copy_backwards_loop:
; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1024, [[COPY_BACKWARDS]] ]
; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP1]], 1
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR]]
; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR]]
; ALL-NEXT:    store i8 [[ELEMENT]], i8 addrspace(1)* [[TMP3]], align 1
; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
; ALL:       copy_forward:
; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
; ALL:       copy_forward_loop:
; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR1]]
; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, i8 addrspace(1)* [[TMP5]], align 1
; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR1]]
; ALL-NEXT:    store i8 [[ELEMENT2]], i8 addrspace(1)* [[TMP6]], align 1
; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
; ALL-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1024
; ALL-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
; ALL:       memmove_done:
; ALL-NEXT:    ret void
;
  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i1 false)
  ret void
}

define amdgpu_kernel void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @min_size_large_static_memmove_caller0(
; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult i8 addrspace(1)* [[SRC:%.*]], [[DST:%.*]]
; OPT-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1025, 0
; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
; OPT:       copy_backwards:
; OPT-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
; OPT:       copy_backwards_loop:
; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1025, [[COPY_BACKWARDS]] ]
; OPT-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP1]], 1
; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR]]
; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR]]
; OPT-NEXT:    store i8 [[ELEMENT]], i8 addrspace(1)* [[TMP3]], align 1
; OPT-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
; OPT-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
; OPT:       copy_forward:
; OPT-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
; OPT:       copy_forward_loop:
; OPT-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR1]]
; OPT-NEXT:    [[ELEMENT2:%.*]] = load i8, i8 addrspace(1)* [[TMP5]], align 1
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR1]]
; OPT-NEXT:    store i8 [[ELEMENT2]], i8 addrspace(1)* [[TMP6]], align 1
; OPT-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1025
; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
; OPT:       memmove_done:
; OPT-NEXT:    ret void
;
  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i1 false)
  ret void
}

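; memset expands to a plain byte store loop. The leading 'br i1 false' is
; presumably the n == 0 guard, already folded for a known non-zero length.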
define amdgpu_kernel void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
; MAX1024-LABEL: @max_size_small_static_memset_caller0(
; MAX1024-NEXT:    call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 1024, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @max_size_small_static_memset_caller0(
; ALL-NEXT:    br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; ALL:       loadstoreloop:
; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
; ALL-NEXT:    store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]], align 1
; ALL-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
; ALL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1024
; ALL-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; ALL:       split:
; ALL-NEXT:    ret void
;
  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i1 false)
  ret void
}

define amdgpu_kernel void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
; OPT-LABEL: @min_size_large_static_memset_caller0(
; OPT-NEXT:    br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT:       loadstoreloop:
; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
; OPT-NEXT:    store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]], align 1
; OPT-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
; OPT-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1025
; OPT-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT:       split:
; OPT-NEXT:    ret void
;
  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1025, i1 false)
  ret void
}

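; A runtime size is expanded in both runs: a main loop copies n / 16 chunks of
; <4 x i32>, then a residual loop copies the remaining n % 16 bytes one at a
; time.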
define amdgpu_kernel void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller0(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i1 false)
  ret void
}

define amdgpu_kernel void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller1(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i1 false)
  ret void
}

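; Both dynamically sized memcpys in one function should be expanded, each
; getting its own loop nest; the block names pick up uniquing suffixes
; (loop-memcpy-expansion2, loop-memcpy-residual4, ...).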
define amdgpu_kernel void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 {
; OPT-LABEL: @memcpy_multi_use_one_function(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST0:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION2:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5:%.*]]
; OPT:       loop-memcpy-expansion2:
; OPT-NEXT:    [[LOOP_INDEX3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION2]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX3]]
; OPT-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX3]]
; OPT-NEXT:    store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX3]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION2]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5]]
; OPT:       loop-memcpy-residual4:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX6:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER5]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL4:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX6]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX6]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1:%.*]]
; OPT:       post-loop-memcpy-expansion1:
; OPT-NEXT:    [[TMP20:%.*]] = bitcast i8 addrspace(1)* [[SRC]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP21:%.*]] = bitcast i8 addrspace(1)* [[DST1:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP22:%.*]] = udiv i64 [[M:%.*]], 16
; OPT-NEXT:    [[TMP23:%.*]] = urem i64 [[M]], 16
; OPT-NEXT:    [[TMP24:%.*]] = sub i64 [[M]], [[TMP23]]
; OPT-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP22]], 0
; OPT-NEXT:    br i1 [[TMP25]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION1]] ], [ [[TMP29:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP20]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP27:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP26]], align 1
; OPT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP21]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP27]], <4 x i32> addrspace(1)* [[TMP28]], align 1
; OPT-NEXT:    [[TMP29]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP30:%.*]] = icmp ult i64 [[TMP29]], [[TMP22]]
; OPT-NEXT:    br i1 [[TMP30]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP37:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP20]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP21]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP33:%.*]] = add i64 [[TMP24]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP31]], i64 [[TMP33]]
; OPT-NEXT:    [[TMP35:%.*]] = load i8, i8 addrspace(1)* [[TMP34]], align 1
; OPT-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP32]], i64 [[TMP33]]
; OPT-NEXT:    store i8 [[TMP35]], i8 addrspace(1)* [[TMP36]], align 1
; OPT-NEXT:    [[TMP37]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP38:%.*]] = icmp ult i64 [[TMP37]], [[TMP23]]
; OPT-NEXT:    br i1 [[TMP38]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP39:%.*]] = icmp ne i64 [[TMP23]], 0
; OPT-NEXT:    br i1 [[TMP39]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
; OPT:       loop-memcpy-residual-header5:
; OPT-NEXT:    [[TMP40:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP40]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i1 false)
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %m, i1 false)
  ret void
}

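; Copy from LDS (addrspace(3)) with an i32 length; here the main loop uses
; <2 x i32> elements, so the split is n / 8 chunks plus an n % 8 byte residual.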
define amdgpu_kernel void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_alt_type(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <2 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT:    [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT:    [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(3)* [[TMP7]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <2 x i32> [[TMP8]], <2 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <2 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 1
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT:    [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n, i1 false)
  ret void
}

; One of the uses in the function should be expanded, the other left alone.
define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 {
; MAX1024-LABEL: @memcpy_multi_use_one_function_keep_small(
; MAX1024-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; MAX1024-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST0:%.*]] to <4 x i32> addrspace(1)*
; MAX1024-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; MAX1024-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 16
; MAX1024-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; MAX1024-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; MAX1024-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024:       loop-memcpy-expansion:
; MAX1024-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; MAX1024-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; MAX1024-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; MAX1024-NEXT:    store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; MAX1024-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; MAX1024-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; MAX1024-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024:       loop-memcpy-residual:
; MAX1024-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; MAX1024-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; MAX1024-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; MAX1024-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; MAX1024-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; MAX1024-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; MAX1024-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; MAX1024-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; MAX1024-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; MAX1024-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; MAX1024-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; MAX1024:       post-loop-memcpy-expansion:
; MAX1024-NEXT:    call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* [[DST1:%.*]], i8 addrspace(1)* [[SRC]], i64 102, i1 false)
; MAX1024-NEXT:    ret void
; MAX1024:       loop-memcpy-residual-header:
; MAX1024-NEXT:    [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; MAX1024-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memcpy_multi_use_one_function_keep_small(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST0:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; ALL-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 16
; ALL-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; ALL-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; ALL-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL:       loop-memcpy-expansion:
; ALL-NEXT:    [[LOOP_INDEX1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX1]]
; ALL-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; ALL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX1]]
; ALL-NEXT:    store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; ALL-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX1]], 1
; ALL-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; ALL-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL:       loop-memcpy-residual:
; ALL-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; ALL-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; ALL-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; ALL-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; ALL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; ALL-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; ALL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; ALL-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; ALL-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; ALL-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; ALL-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; ALL:       post-loop-memcpy-expansion:
; ALL-NEXT:    [[TMP20:%.*]] = bitcast i8 addrspace(1)* [[SRC]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    [[TMP21:%.*]] = bitcast i8 addrspace(1)* [[DST1:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; ALL:       load-store-loop:
; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION]] ], [ [[TMP25:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT:    [[TMP22:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP20]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    [[TMP23:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP22]], align 1
; ALL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP21]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    store <4 x i32> [[TMP23]], <4 x i32> addrspace(1)* [[TMP24]], align 1
; ALL-NEXT:    [[TMP25]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT:    [[TMP26:%.*]] = icmp ult i64 [[TMP25]], 6
; ALL-NEXT:    br i1 [[TMP26]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL:       memcpy-split:
; ALL-NEXT:    [[TMP27:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP20]] to i32 addrspace(1)*
; ALL-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP27]], i64 24
; ALL-NEXT:    [[TMP29:%.*]] = load i32, i32 addrspace(1)* [[TMP28]], align 1
; ALL-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP21]] to i32 addrspace(1)*
; ALL-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP30]], i64 24
; ALL-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[TMP31]], align 1
; ALL-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP20]] to i16 addrspace(1)*
; ALL-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP32]], i64 50
; ALL-NEXT:    [[TMP34:%.*]] = load i16, i16 addrspace(1)* [[TMP33]], align 1
; ALL-NEXT:    [[TMP35:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP21]] to i16 addrspace(1)*
; ALL-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP35]], i64 50
; ALL-NEXT:    store i16 [[TMP34]], i16 addrspace(1)* [[TMP36]], align 1
; ALL-NEXT:    ret void
; ALL:       loop-memcpy-residual-header:
; ALL-NEXT:    [[TMP37:%.*]] = icmp ne i64 [[TMP4]], 0
; ALL-NEXT:    br i1 [[TMP37]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i1 false)
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i1 false)
  ret void
}

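; The @memcpy_global_align4_global_align4_* tests below cover residual handling
; for static sizes just past 1024: after the 64-iteration <4 x i32> loop, the
; leftover bytes are copied with the widest pieces that size and alignment
; permit (i64, i32, i16, then i8).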
define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1028(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i32 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP8]], i64 256
; OPT-NEXT:    [[TMP10:%.*]] = load i32, i32 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i32 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP11]], i64 256
; OPT-NEXT:    store i32 [[TMP10]], i32 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1028, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1025(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1024
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1024
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1025, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1026(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP8]], i64 512
; OPT-NEXT:    [[TMP10:%.*]] = load i16, i16 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP11]], i64 512
; OPT-NEXT:    store i16 [[TMP10]], i16 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1026, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1032(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT:    [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT:    store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1032, i1 false)
  ret void
}

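; 1034 = 64 x 16 + 8 + 2, so the residual is an i64 piece followed by an i16
; piece.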
define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1034(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT:    [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT:    store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP13]], i64 516
; OPT-NEXT:    [[TMP15:%.*]] = load i16, i16 addrspace(1)* [[TMP14]], align 4
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP16]], i64 516
; OPT-NEXT:    store i16 [[TMP15]], i16 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1034, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1035(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT:    [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT:    store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP13]], i64 516
; OPT-NEXT:    [[TMP15:%.*]] = load i16, i16 addrspace(1)* [[TMP14]], align 4
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP16]], i64 516
; OPT-NEXT:    store i16 [[TMP15]], i16 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT:    [[TMP18:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP18]], i64 1034
; OPT-NEXT:    [[TMP20:%.*]] = load i8, i8 addrspace(1)* [[TMP19]], align 2
; OPT-NEXT:    [[TMP21:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP21]], i64 1034
; OPT-NEXT:    store i8 [[TMP20]], i8 addrspace(1)* [[TMP22]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1035, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1036(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT:    [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT:    store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i32 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP13]], i64 258
; OPT-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(1)* [[TMP14]], align 4
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i32 addrspace(1)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP16]], i64 258
; OPT-NEXT:    store i32 [[TMP15]], i32 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1036, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1039(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT:    [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT:    store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i32 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP13]], i64 258
; OPT-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(1)* [[TMP14]], align 4
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i32 addrspace(1)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP16]], i64 258
; OPT-NEXT:    store i32 [[TMP15]], i32 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT:    [[TMP18:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP18]], i64 518
; OPT-NEXT:    [[TMP20:%.*]] = load i16, i16 addrspace(1)* [[TMP19]], align 4
; OPT-NEXT:    [[TMP21:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP21]], i64 518
; OPT-NEXT:    store i16 [[TMP20]], i16 addrspace(1)* [[TMP22]], align 4
; OPT-NEXT:    [[TMP23:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP23]], i64 1038
; OPT-NEXT:    [[TMP25:%.*]] = load i8, i8 addrspace(1)* [[TMP24]], align 2
; OPT-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP26]], i64 1038
; OPT-NEXT:    store i8 [[TMP25]], i8 addrspace(1)* [[TMP27]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1039, i1 false)
  ret void
}

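; With only 2-byte alignment the main loop drops to i16 elements (519
; iterations), leaving a single trailing i8.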
define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_1039(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load i16, i16 addrspace(1)* [[TMP3]], align 2
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP4]], i16 addrspace(1)* [[TMP5]], align 2
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 519
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast i16 addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1038
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 2
; OPT-NEXT:    [[TMP11:%.*]] = bitcast i16 addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1038
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 2 %dst, i8 addrspace(1)* align 2 %src, i64 1039, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1027(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP8]], i64 512
; OPT-NEXT:    [[TMP10:%.*]] = load i16, i16 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP11]], i64 512
; OPT-NEXT:    store i16 [[TMP10]], i16 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 1026
; OPT-NEXT:    [[TMP15:%.*]] = load i8, i8 addrspace(1)* [[TMP14]], align 2
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP16]], i64 1026
; OPT-NEXT:    store i8 [[TMP15]], i8 addrspace(1)* [[TMP17]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1027, i1 false)
  ret void
}

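; Mixed alignments: the expansion appears to use the smaller of the source and
; destination alignments, so these also get the i16 loop.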
define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align4_1027(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load i16, i16 addrspace(1)* [[TMP3]], align 2
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP4]], i16 addrspace(1)* [[TMP5]], align 2
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 513
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast i16 addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1026
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 2
; OPT-NEXT:    [[TMP11:%.*]] = bitcast i16 addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1026
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 2 %dst, i8 addrspace(1)* align 4 %src, i64 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align2_1027(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load i16, i16 addrspace(1)* [[TMP3]], align 2
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP4]], i16 addrspace(1)* [[TMP5]], align 2
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 513
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast i16 addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1026
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 2
; OPT-NEXT:    [[TMP11:%.*]] = bitcast i16 addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1026
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 2 %src, i64 1027, i1 false)
  ret void
}

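; The same residual patterns apply to private (addrspace(5)) pointers, with
; i32 loop indices instead of i64.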
define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align4_private_align4_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(5)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(5)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP8]], i32 512
; OPT-NEXT: [[TMP10:%.*]] = load i16, i16 addrspace(5)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP11]], i32 512
; OPT-NEXT: store i16 [[TMP10]], i16 addrspace(5)* [[TMP12]], align 4
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP13]], i32 1026
; OPT-NEXT: [[TMP15:%.*]] = load i8, i8 addrspace(5)* [[TMP14]], align 2
; OPT-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP16]], i32 1026
; OPT-NEXT: store i8 [[TMP15]], i8 addrspace(5)* [[TMP17]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 4 %dst, i8 addrspace(5)* align 4 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align2_private_align4_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to i16 addrspace(5)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load i16, i16 addrspace(5)* [[TMP3]], align 2
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP4]], i16 addrspace(5)* [[TMP5]], align 2
; OPT-NEXT: [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 513
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast i16 addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP8]], i32 1026
; OPT-NEXT: [[TMP10:%.*]] = load i8, i8 addrspace(5)* [[TMP9]], align 2
; OPT-NEXT: [[TMP11:%.*]] = bitcast i16 addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP11]], i32 1026
; OPT-NEXT: store i8 [[TMP10]], i8 addrspace(5)* [[TMP12]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 2 %dst, i8 addrspace(5)* align 4 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align1_private_align4_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(5)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(5)* [[TMP5]], align 1
; OPT-NEXT: [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP8]], i32 512
; OPT-NEXT: [[TMP10:%.*]] = load i16, i16 addrspace(5)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP11]], i32 512
; OPT-NEXT: store i16 [[TMP10]], i16 addrspace(5)* [[TMP12]], align 1
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP13]], i32 1026
; OPT-NEXT: [[TMP15:%.*]] = load i8, i8 addrspace(5)* [[TMP14]], align 2
; OPT-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP16]], i32 1026
; OPT-NEXT: store i8 [[TMP15]], i8 addrspace(5)* [[TMP17]], align 1
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 1 %dst, i8 addrspace(5)* align 4 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align4_private_align2_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to i16 addrspace(5)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load i16, i16 addrspace(5)* [[TMP3]], align 2
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP4]], i16 addrspace(5)* [[TMP5]], align 2
; OPT-NEXT: [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 513
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast i16 addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP8]], i32 1026
; OPT-NEXT: [[TMP10:%.*]] = load i8, i8 addrspace(5)* [[TMP9]], align 2
; OPT-NEXT: [[TMP11:%.*]] = bitcast i16 addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP11]], i32 1026
; OPT-NEXT: store i8 [[TMP10]], i8 addrspace(5)* [[TMP12]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 4 %dst, i8 addrspace(5)* align 2 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align4_private_align1_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(5)* [[TMP3]], align 1
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(5)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP8]], i32 512
; OPT-NEXT: [[TMP10:%.*]] = load i16, i16 addrspace(5)* [[TMP9]], align 1
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP11]], i32 512
; OPT-NEXT: store i16 [[TMP10]], i16 addrspace(5)* [[TMP12]], align 4
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP13]], i32 1026
; OPT-NEXT: [[TMP15:%.*]] = load i8, i8 addrspace(5)* [[TMP14]], align 1
; OPT-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP16]], i32 1026
; OPT-NEXT: store i8 [[TMP15]], i8 addrspace(5)* [[TMP17]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 4 %dst, i8 addrspace(5)* align 1 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align2_private_align2_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to i16 addrspace(5)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load i16, i16 addrspace(5)* [[TMP3]], align 2
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP4]], i16 addrspace(5)* [[TMP5]], align 2
; OPT-NEXT: [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 513
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast i16 addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP8]], i32 1026
; OPT-NEXT: [[TMP10:%.*]] = load i8, i8 addrspace(5)* [[TMP9]], align 2
; OPT-NEXT: [[TMP11:%.*]] = bitcast i16 addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP11]], i32 1026
; OPT-NEXT: store i8 [[TMP10]], i8 addrspace(5)* [[TMP12]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 2 %dst, i8 addrspace(5)* align 2 %src, i32 1027, i1 false)
  ret void
}
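
; A size only known at runtime cannot be unrolled, so both run lines expand it
; the same way: a main loop over the widest copy unit plus a guarded byte-wise
; residual loop. For a 16-byte unit the pass computes
;   count   = n udiv 16   (main loop trip count)
;   residue = n urem 16   (bytes left over)
;   base    = n - residue (offset where the residual loop starts)
; and branches around either loop when its trip count is zero.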
define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT: [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 4
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 4
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT: [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i64 [[N:%.*]], 2
; OPT-NEXT: [[TMP4:%.*]] = urem i64 [[N]], 2
; OPT-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load i16, i16 addrspace(1)* [[TMP7]], align 2
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP8]], i16 addrspace(1)* [[TMP9]], align 2
; OPT-NEXT: [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast i16 addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast i16 addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 2
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 2
; OPT-NEXT: [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 2 %dst, i8 addrspace(1)* align 2 %src, i64 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align1_global_align1_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT: [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT: [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT: [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 1 %dst, i8 addrspace(1)* align 1 %src, i64 %n, i1 false)
  ret void
}
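
; The local (addrspace(3)) variants below use a <2 x i32> copy unit (8 bytes)
; instead of <4 x i32>, with an i32 size and loop index.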
define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(3)* [[DST:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT: [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(3)* [[TMP7]], align 4
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP8]], <2 x i32> addrspace(3)* [[TMP9]], align 4
; OPT-NEXT: [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP2]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 4
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(3)* [[TMP17]], align 4
; OPT-NEXT: [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 4 %dst, i8 addrspace(3)* align 4 %src, i32 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to i16 addrspace(3)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(3)* [[DST:%.*]] to i16 addrspace(3)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i32 [[N:%.*]], 2
; OPT-NEXT: [[TMP4:%.*]] = urem i32 [[N]], 2
; OPT-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16 addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load i16, i16 addrspace(3)* [[TMP7]], align 2
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(3)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP8]], i16 addrspace(3)* [[TMP9]], align 2
; OPT-NEXT: [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast i16 addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast i16 addrspace(3)* [[TMP2]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 2
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(3)* [[TMP17]], align 2
; OPT-NEXT: [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 2 %dst, i8 addrspace(3)* align 2 %src, i32 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align1_local_align1_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(3)* [[DST:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT: [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(3)* [[TMP7]], align 1
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP8]], <2 x i32> addrspace(3)* [[TMP9]], align 1
; OPT-NEXT: [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP2]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 1
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(3)* [[TMP17]], align 1
; OPT-NEXT: [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 1 %dst, i8 addrspace(3)* align 1 %src, i32 %n, i1 false)
  ret void
}
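
; Copies between different address spaces: each pointer keeps its own address
; space throughout the expansion.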
define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_global_align4_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <2 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(3)* [[DST:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT: [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(1)* [[TMP7]], align 4
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP8]], <2 x i32> addrspace(3)* [[TMP9]], align 4
; OPT-NEXT: [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP2]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 4
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(3)* [[TMP17]], align 4
; OPT-NEXT: [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %dst, i8 addrspace(1)* align 4 %src, i32 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_local_align4_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <2 x i32> addrspace(1)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT: [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(3)* [[TMP7]], align 4
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP8]], <2 x i32> addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 4
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT: [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %dst, i8 addrspace(3)* align 4 %src, i32 %n, i1 false)
  ret void
}
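
; Small static sizes: MAX1024 keeps these as intrinsic calls since they fall
; below the 1024-byte threshold, while ALL expands them. Sizes that are not a
; multiple of 16 are split into progressively narrower scalar accesses
; (e.g. 12 = 8 + 4, 10 = 8 + 2).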
define amdgpu_kernel void @memcpy_global_align4_global_align4_16(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_16(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 16, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_16(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; ALL-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; ALL-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; ALL-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 1
; ALL-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 16, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_12(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_12(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 12, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_12(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i64 addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT: [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i64 addrspace(1)*
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT: store i64 [[TMP3]], i64 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT: [[TMP6:%.*]] = bitcast i8 addrspace(1)* [[SRC]] to i32 addrspace(1)*
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i64 2
; ALL-NEXT: [[TMP8:%.*]] = load i32, i32 addrspace(1)* [[TMP7]], align 4
; ALL-NEXT: [[TMP9:%.*]] = bitcast i8 addrspace(1)* [[DST]] to i32 addrspace(1)*
; ALL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP9]], i64 2
; ALL-NEXT: store i32 [[TMP8]], i32 addrspace(1)* [[TMP10]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 12, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_8(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_8(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 8, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_8(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i64 addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT: [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i64 addrspace(1)*
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT: store i64 [[TMP3]], i64 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 8, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_10(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_10(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 10, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_10(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i64 addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT: [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i64 addrspace(1)*
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT: store i64 [[TMP3]], i64 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT: [[TMP6:%.*]] = bitcast i8 addrspace(1)* [[SRC]] to i16 addrspace(1)*
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP6]], i64 4
; ALL-NEXT: [[TMP8:%.*]] = load i16, i16 addrspace(1)* [[TMP7]], align 4
; ALL-NEXT: [[TMP9:%.*]] = bitcast i8 addrspace(1)* [[DST]] to i16 addrspace(1)*
; ALL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP9]], i64 4
; ALL-NEXT: store i16 [[TMP8]], i16 addrspace(1)* [[TMP10]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 10, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_4(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_4(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 4, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_4(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i32 addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT: [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i32 addrspace(1)*
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT: store i32 [[TMP3]], i32 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 4, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_2(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_2(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 2, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_2(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT: [[TMP3:%.*]] = load i16, i16 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT: [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT: store i16 [[TMP3]], i16 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 2, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_1(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 1, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_1(
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = load i8, i8 addrspace(1)* [[TMP1]], align 4
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 0
; ALL-NEXT: store i8 [[TMP2]], i8 addrspace(1)* [[TMP3]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1, i1 false)
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind }