; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefixes=OPT,ALL %s
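; The first RUN line leaves memory intrinsics with a constant size of at most
; 1024 bytes as calls (MAX1024 prefix); the second passes an expand size of -1,
; which disables the threshold so every intrinsic is expanded (ALL prefix).
; Checks shared by both configurations use the common OPT prefix.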

declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #1

declare void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i1) #1
declare void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p5i8.p5i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture readonly, i32, i1) #1

declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i1) #1

; Test the largest static size that is left as an intrinsic call when the 1024-byte threshold is in effect
define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @max_size_small_static_memcpy_caller0(
; MAX1024-NEXT:    call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 addrspace(1)* [[SRC:%.*]], i64 1024, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @max_size_small_static_memcpy_caller0(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; ALL:       load-store-loop:
; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 1
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 1
; ALL-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; ALL-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL:       memcpy-split:
; ALL-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i1 false)
  ret void
}

; Smallest static size that is expanded even with the 1024-byte threshold
define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @min_size_large_static_memcpy_caller0(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 1
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 1
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1024
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1024
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 1
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i1 false)
  ret void
}
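; In both run configurations (shared OPT prefix), 1025 = 64 * 16 + 1: the main
; loop copies 64 <4 x i32> elements (1024 bytes), and the memcpy-split block
; copies the single trailing byte at offset 1024.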

define amdgpu_kernel void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @max_size_small_static_memmove_caller0(
; MAX1024-NEXT:    call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 addrspace(1)* [[SRC:%.*]], i64 1024, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @max_size_small_static_memmove_caller0(
; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult i8 addrspace(1)* [[SRC:%.*]], [[DST:%.*]]
; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1024, 0
; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
; ALL:       copy_backwards:
; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
; ALL:       copy_backwards_loop:
; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1024, [[COPY_BACKWARDS]] ]
; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP1]], 1
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR]]
; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR]]
; ALL-NEXT:    store i8 [[ELEMENT]], i8 addrspace(1)* [[TMP3]], align 1
; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
; ALL:       copy_forward:
; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
; ALL:       copy_forward_loop:
; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR1]]
; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, i8 addrspace(1)* [[TMP5]], align 1
; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR1]]
; ALL-NEXT:    store i8 [[ELEMENT2]], i8 addrspace(1)* [[TMP6]], align 1
; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
; ALL-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1024
; ALL-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
; ALL:       memmove_done:
; ALL-NEXT:    ret void
;
  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i1 false)
  ret void
}
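; The memmove expansion guards against overlap at run time: when the source
; address is below the destination (icmp ult), the copy runs backwards from the
; end, otherwise forwards. Each direction is additionally guarded by a
; size-zero check, which here compares the constant 1024 against 0.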

define amdgpu_kernel void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @min_size_large_static_memmove_caller0(
; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult i8 addrspace(1)* [[SRC:%.*]], [[DST:%.*]]
; OPT-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1025, 0
; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
; OPT:       copy_backwards:
; OPT-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
; OPT:       copy_backwards_loop:
; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1025, [[COPY_BACKWARDS]] ]
; OPT-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP1]], 1
; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR]]
; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR]]
; OPT-NEXT:    store i8 [[ELEMENT]], i8 addrspace(1)* [[TMP3]], align 1
; OPT-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
; OPT-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
; OPT:       copy_forward:
; OPT-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
; OPT:       copy_forward_loop:
; OPT-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR1]]
; OPT-NEXT:    [[ELEMENT2:%.*]] = load i8, i8 addrspace(1)* [[TMP5]], align 1
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR1]]
; OPT-NEXT:    store i8 [[ELEMENT2]], i8 addrspace(1)* [[TMP6]], align 1
; OPT-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1025
; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
; OPT:       memmove_done:
; OPT-NEXT:    ret void
;
  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i1 false)
  ret void
}

define amdgpu_kernel void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
; MAX1024-LABEL: @max_size_small_static_memset_caller0(
; MAX1024-NEXT:    call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 1024, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @max_size_small_static_memset_caller0(
; ALL-NEXT:    br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; ALL:       loadstoreloop:
; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
; ALL-NEXT:    store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]], align 1
; ALL-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
; ALL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1024
; ALL-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; ALL:       split:
; ALL-NEXT:    ret void
;
  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i1 false)
  ret void
}

define amdgpu_kernel void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
; OPT-LABEL: @min_size_large_static_memset_caller0(
; OPT-NEXT:    br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT:       loadstoreloop:
; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
; OPT-NEXT:    store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]], align 1
; OPT-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
; OPT-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1025
; OPT-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT:       split:
; OPT-NEXT:    ret void
;
  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1025, i1 false)
  ret void
}
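; A dynamic size takes the same memset expansion path; the zero-trip guard then
; compares the runtime length instead of folding to "br i1 false". A minimal
; unchecked sketch (hypothetical kernel name, illustrative only):
define amdgpu_kernel void @variable_memset_sketch(i8 addrspace(1)* %dst, i8 %val, i64 %n) #0 {
  ; Expands to the same loadstoreloop structure, guarded by icmp eq i64 %n, 0.
  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 %n, i1 false)
  ret void
}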

define amdgpu_kernel void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller0(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i1 false)
  ret void
}
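; The runtime byte count is split into a vector part and a residue: the main
; loop runs n / 16 times on <4 x i32>, and the residual loop copies the
; remaining n % 16 bytes one at a time. For example, n = 100 gives 6 vector
; iterations (96 bytes) followed by 4 single-byte iterations.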

define amdgpu_kernel void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller1(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 {
; OPT-LABEL: @memcpy_multi_use_one_function(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST0:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION2:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5:%.*]]
; OPT:       loop-memcpy-expansion2:
; OPT-NEXT:    [[LOOP_INDEX3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION2]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX3]]
; OPT-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX3]]
; OPT-NEXT:    store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX3]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION2]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5]]
; OPT:       loop-memcpy-residual4:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX6:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER5]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL4:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX6]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX6]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1:%.*]]
; OPT:       post-loop-memcpy-expansion1:
; OPT-NEXT:    [[TMP20:%.*]] = bitcast i8 addrspace(1)* [[SRC]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP21:%.*]] = bitcast i8 addrspace(1)* [[DST1:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP22:%.*]] = udiv i64 [[M:%.*]], 16
; OPT-NEXT:    [[TMP23:%.*]] = urem i64 [[M]], 16
; OPT-NEXT:    [[TMP24:%.*]] = sub i64 [[M]], [[TMP23]]
; OPT-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP22]], 0
; OPT-NEXT:    br i1 [[TMP25]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION1]] ], [ [[TMP29:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP20]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP27:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP26]], align 1
; OPT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP21]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP27]], <4 x i32> addrspace(1)* [[TMP28]], align 1
; OPT-NEXT:    [[TMP29]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP30:%.*]] = icmp ult i64 [[TMP29]], [[TMP22]]
; OPT-NEXT:    br i1 [[TMP30]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP37:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP20]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP21]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP33:%.*]] = add i64 [[TMP24]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP31]], i64 [[TMP33]]
; OPT-NEXT:    [[TMP35:%.*]] = load i8, i8 addrspace(1)* [[TMP34]], align 1
; OPT-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP32]], i64 [[TMP33]]
; OPT-NEXT:    store i8 [[TMP35]], i8 addrspace(1)* [[TMP36]], align 1
; OPT-NEXT:    [[TMP37]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP38:%.*]] = icmp ult i64 [[TMP37]], [[TMP23]]
; OPT-NEXT:    br i1 [[TMP38]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP39:%.*]] = icmp ne i64 [[TMP23]], 0
; OPT-NEXT:    br i1 [[TMP39]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
; OPT:       loop-memcpy-residual-header5:
; OPT-NEXT:    [[TMP40:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP40]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i1 false)
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %m, i1 false)
  ret void
}
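; The two expansions are emitted independently in one function; the second
; copy's loop begins in the first copy's post-loop-memcpy-expansion1 block, and
; the first expansion's blocks pick up renaming suffixes (expansion2,
; residual4, residual-header5) to stay unique.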

define amdgpu_kernel void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_alt_type(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <2 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT:    [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT:    [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(3)* [[TMP7]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <2 x i32> [[TMP8]], <2 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <2 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 1
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT:    [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n, i1 false)
  ret void
}
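; With an addrspace(3) source and an i32 length, the expansion uses <2 x i32>
; (8 bytes) as the copy type, so the length is split as n / 8 vector iterations
; plus n % 8 residual bytes, all with i32 indexing.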

; One of the memcpy calls in this function should be expanded while the other
; is left alone: under MAX1024 the 102-byte copy stays an intrinsic call, while
; the dynamic-size copy is always expanded.
define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 {
; MAX1024-LABEL: @memcpy_multi_use_one_function_keep_small(
; MAX1024-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; MAX1024-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST0:%.*]] to <4 x i32> addrspace(1)*
; MAX1024-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; MAX1024-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 16
; MAX1024-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; MAX1024-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; MAX1024-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024:       loop-memcpy-expansion:
; MAX1024-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; MAX1024-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; MAX1024-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; MAX1024-NEXT:    store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; MAX1024-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; MAX1024-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; MAX1024-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024:       loop-memcpy-residual:
; MAX1024-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; MAX1024-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; MAX1024-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; MAX1024-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; MAX1024-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; MAX1024-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; MAX1024-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; MAX1024-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; MAX1024-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; MAX1024-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; MAX1024-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; MAX1024:       post-loop-memcpy-expansion:
; MAX1024-NEXT:    call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* [[DST1:%.*]], i8 addrspace(1)* [[SRC]], i64 102, i1 false)
; MAX1024-NEXT:    ret void
; MAX1024:       loop-memcpy-residual-header:
; MAX1024-NEXT:    [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; MAX1024-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memcpy_multi_use_one_function_keep_small(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST0:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; ALL-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 16
; ALL-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; ALL-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; ALL-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL:       loop-memcpy-expansion:
; ALL-NEXT:    [[LOOP_INDEX1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX1]]
; ALL-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; ALL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX1]]
; ALL-NEXT:    store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; ALL-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX1]], 1
; ALL-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; ALL-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL:       loop-memcpy-residual:
; ALL-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; ALL-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; ALL-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; ALL-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; ALL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; ALL-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; ALL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; ALL-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; ALL-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; ALL-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; ALL-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; ALL:       post-loop-memcpy-expansion:
; ALL-NEXT:    [[TMP20:%.*]] = bitcast i8 addrspace(1)* [[SRC]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    [[TMP21:%.*]] = bitcast i8 addrspace(1)* [[DST1:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; ALL:       load-store-loop:
; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION]] ], [ [[TMP25:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT:    [[TMP22:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP20]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    [[TMP23:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP22]], align 1
; ALL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP21]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    store <4 x i32> [[TMP23]], <4 x i32> addrspace(1)* [[TMP24]], align 1
; ALL-NEXT:    [[TMP25]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT:    [[TMP26:%.*]] = icmp ult i64 [[TMP25]], 6
; ALL-NEXT:    br i1 [[TMP26]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL:       memcpy-split:
; ALL-NEXT:    [[TMP27:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP20]] to i32 addrspace(1)*
; ALL-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP27]], i64 24
; ALL-NEXT:    [[TMP29:%.*]] = load i32, i32 addrspace(1)* [[TMP28]], align 1
; ALL-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP21]] to i32 addrspace(1)*
; ALL-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP30]], i64 24
; ALL-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[TMP31]], align 1
; ALL-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP20]] to i16 addrspace(1)*
; ALL-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP32]], i64 50
; ALL-NEXT:    [[TMP34:%.*]] = load i16, i16 addrspace(1)* [[TMP33]], align 1
; ALL-NEXT:    [[TMP35:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP21]] to i16 addrspace(1)*
; ALL-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP35]], i64 50
; ALL-NEXT:    store i16 [[TMP34]], i16 addrspace(1)* [[TMP36]], align 1
; ALL-NEXT:    ret void
; ALL:       loop-memcpy-residual-header:
; ALL-NEXT:    [[TMP37:%.*]] = icmp ne i64 [[TMP4]], 0
; ALL-NEXT:    br i1 [[TMP37]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i1 false)
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i1 false)
  ret void
}
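; Under ALL, the 102-byte copy is also expanded: 102 = 6 * 16 + 4 + 2, i.e.
; 6 <4 x i32> iterations, one i32 at byte offset 96 (i32 index 24), and one
; i16 at byte offset 100 (i16 index 50).
;
; The tests below cover static sizes just past a multiple of 16: the main
; <4 x i32> loop handles the first 1024 bytes, and the remainder is covered
; greedily with the widest types the size and alignment allow (i64, i32, i16,
; i8).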

define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1028(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i32 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP8]], i64 256
; OPT-NEXT:    [[TMP10:%.*]] = load i32, i32 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i32 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP11]], i64 256
; OPT-NEXT:    store i32 [[TMP10]], i32 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1028, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1025(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1024
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1024
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1025, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1026(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP8]], i64 512
; OPT-NEXT:    [[TMP10:%.*]] = load i16, i16 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP11]], i64 512
; OPT-NEXT:    store i16 [[TMP10]], i16 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1026, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1032(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT:    [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT:    store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1032, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1034(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT:    [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT:    store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP13]], i64 516
; OPT-NEXT:    [[TMP15:%.*]] = load i16, i16 addrspace(1)* [[TMP14]], align 4
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP16]], i64 516
; OPT-NEXT:    store i16 [[TMP15]], i16 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1034, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1035(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT:    [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT:    store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP13]], i64 516
; OPT-NEXT:    [[TMP15:%.*]] = load i16, i16 addrspace(1)* [[TMP14]], align 4
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP16]], i64 516
; OPT-NEXT:    store i16 [[TMP15]], i16 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT:    [[TMP18:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP18]], i64 1034
; OPT-NEXT:    [[TMP20:%.*]] = load i8, i8 addrspace(1)* [[TMP19]], align 2
; OPT-NEXT:    [[TMP21:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP21]], i64 1034
; OPT-NEXT:    store i8 [[TMP20]], i8 addrspace(1)* [[TMP22]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1035, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1036(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT:    [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT:    store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i32 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP13]], i64 258
; OPT-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(1)* [[TMP14]], align 4
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i32 addrspace(1)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP16]], i64 258
; OPT-NEXT:    store i32 [[TMP15]], i32 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1036, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1039(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT:    [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT:    store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i32 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP13]], i64 258
; OPT-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(1)* [[TMP14]], align 4
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i32 addrspace(1)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP16]], i64 258
; OPT-NEXT:    store i32 [[TMP15]], i32 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT:    [[TMP18:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP18]], i64 518
; OPT-NEXT:    [[TMP20:%.*]] = load i16, i16 addrspace(1)* [[TMP19]], align 4
; OPT-NEXT:    [[TMP21:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP21]], i64 518
; OPT-NEXT:    store i16 [[TMP20]], i16 addrspace(1)* [[TMP22]], align 4
; OPT-NEXT:    [[TMP23:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP23]], i64 1038
; OPT-NEXT:    [[TMP25:%.*]] = load i8, i8 addrspace(1)* [[TMP24]], align 2
; OPT-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP26]], i64 1038
; OPT-NEXT:    store i8 [[TMP25]], i8 addrspace(1)* [[TMP27]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1039, i1 false)
  ret void
}
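; 1039 = 64 * 16 + 8 + 4 + 2 + 1: after the vector loop, the 15-byte remainder
; is covered by one i64 (byte offset 1024), one i32 (1032), one i16 (1036), and
; one i8 (1038).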

define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_1039(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load i16, i16 addrspace(1)* [[TMP3]], align 2
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP4]], i16 addrspace(1)* [[TMP5]], align 2
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 519
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast i16 addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1038
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 2
; OPT-NEXT:    [[TMP11:%.*]] = bitcast i16 addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1038
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 2 %dst, i8 addrspace(1)* align 2 %src, i64 1039, i1 false)
  ret void
}
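; With only 2-byte alignment, the main loop element narrows to i16: 519
; iterations copy 1038 bytes, and the final odd byte at offset 1038 is copied
; as an i8.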
761
define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1027(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP8]], i64 512
; OPT-NEXT:    [[TMP10:%.*]] = load i16, i16 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP11]], i64 512
; OPT-NEXT:    store i16 [[TMP10]], i16 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 1026
; OPT-NEXT:    [[TMP15:%.*]] = load i8, i8 addrspace(1)* [[TMP14]], align 2
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP16]], i64 1026
; OPT-NEXT:    store i8 [[TMP15]], i8 addrspace(1)* [[TMP17]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1027, i1 false)
  ret void
}

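; Mismatched alignment uses the smaller of the two; the align 2 destination limits the copy to an i16 loop.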
define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align4_1027(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load i16, i16 addrspace(1)* [[TMP3]], align 2
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP4]], i16 addrspace(1)* [[TMP5]], align 2
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 513
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast i16 addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1026
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 2
; OPT-NEXT:    [[TMP11:%.*]] = bitcast i16 addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1026
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 2 %dst, i8 addrspace(1)* align 4 %src, i64 1027, i1 false)
  ret void
}

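; Same size with the source under-aligned instead; the align 2 source likewise limits the loop to i16.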
define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align2_1027(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load i16, i16 addrspace(1)* [[TMP3]], align 2
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP4]], i16 addrspace(1)* [[TMP5]], align 2
; OPT-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 513
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast i16 addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1026
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 2
; OPT-NEXT:    [[TMP11:%.*]] = bitcast i16 addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1026
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 2 %src, i64 1027, i1 false)
  ret void
}

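; The private (scratch) variants of the 1027-byte tests use an i32 loop index rather than i64.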
define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align4_private_align4_1027(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(5)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(5)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i16 addrspace(5)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP8]], i32 512
; OPT-NEXT:    [[TMP10:%.*]] = load i16, i16 addrspace(5)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i16 addrspace(5)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP11]], i32 512
; OPT-NEXT:    store i16 [[TMP10]], i16 addrspace(5)* [[TMP12]], align 4
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP13]], i32 1026
; OPT-NEXT:    [[TMP15:%.*]] = load i8, i8 addrspace(5)* [[TMP14]], align 2
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP16]], i32 1026
; OPT-NEXT:    store i8 [[TMP15]], i8 addrspace(5)* [[TMP17]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 4 %dst, i8 addrspace(5)* align 4 %src, i32 1027, i1 false)
  ret void
}

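; align 2 destination in private memory: i16 loop, as in the global case.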
define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align2_private_align4_1027(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to i16 addrspace(5)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to i16 addrspace(5)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load i16, i16 addrspace(5)* [[TMP3]], align 2
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP4]], i16 addrspace(5)* [[TMP5]], align 2
; OPT-NEXT:    [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 513
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast i16 addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP8]], i32 1026
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(5)* [[TMP9]], align 2
; OPT-NEXT:    [[TMP11:%.*]] = bitcast i16 addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP11]], i32 1026
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(5)* [[TMP12]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 2 %dst, i8 addrspace(5)* align 4 %src, i32 1027, i1 false)
  ret void
}

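; An align 1 destination still gets the <4 x i32> loop; its stores are just emitted with align 1.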
define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align1_private_align4_1027(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(5)* [[TMP3]], align 4
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(5)* [[TMP5]], align 1
; OPT-NEXT:    [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i16 addrspace(5)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP8]], i32 512
; OPT-NEXT:    [[TMP10:%.*]] = load i16, i16 addrspace(5)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i16 addrspace(5)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP11]], i32 512
; OPT-NEXT:    store i16 [[TMP10]], i16 addrspace(5)* [[TMP12]], align 1
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP13]], i32 1026
; OPT-NEXT:    [[TMP15:%.*]] = load i8, i8 addrspace(5)* [[TMP14]], align 2
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP16]], i32 1026
; OPT-NEXT:    store i8 [[TMP15]], i8 addrspace(5)* [[TMP17]], align 1
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 1 %dst, i8 addrspace(5)* align 4 %src, i32 1027, i1 false)
  ret void
}

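; align 2 source in private memory limits the copy to an i16 loop.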
define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align4_private_align2_1027(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to i16 addrspace(5)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to i16 addrspace(5)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load i16, i16 addrspace(5)* [[TMP3]], align 2
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP4]], i16 addrspace(5)* [[TMP5]], align 2
; OPT-NEXT:    [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 513
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast i16 addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP8]], i32 1026
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(5)* [[TMP9]], align 2
; OPT-NEXT:    [[TMP11:%.*]] = bitcast i16 addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP11]], i32 1026
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(5)* [[TMP12]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 4 %dst, i8 addrspace(5)* align 2 %src, i32 1027, i1 false)
  ret void
}

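; An align 1 source likewise keeps the vector loop, with align 1 loads.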
define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align4_private_align1_1027(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(5)* [[TMP3]], align 1
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(5)* [[TMP5]], align 4
; OPT-NEXT:    [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 64
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i16 addrspace(5)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP8]], i32 512
; OPT-NEXT:    [[TMP10:%.*]] = load i16, i16 addrspace(5)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i16 addrspace(5)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP11]], i32 512
; OPT-NEXT:    store i16 [[TMP10]], i16 addrspace(5)* [[TMP12]], align 4
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP13]], i32 1026
; OPT-NEXT:    [[TMP15:%.*]] = load i8, i8 addrspace(5)* [[TMP14]], align 1
; OPT-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP16]], i32 1026
; OPT-NEXT:    store i8 [[TMP15]], i8 addrspace(5)* [[TMP17]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 4 %dst, i8 addrspace(5)* align 1 %src, i32 1027, i1 false)
  ret void
}

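; Both sides align 2: i16 loop plus an i8 tail.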
define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align2_private_align2_1027(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to i16 addrspace(5)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to i16 addrspace(5)*
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP4:%.*]] = load i16, i16 addrspace(5)* [[TMP3]], align 2
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP4]], i16 addrspace(5)* [[TMP5]], align 2
; OPT-NEXT:    [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 513
; OPT-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP8:%.*]] = bitcast i16 addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP8]], i32 1026
; OPT-NEXT:    [[TMP10:%.*]] = load i8, i8 addrspace(5)* [[TMP9]], align 2
; OPT-NEXT:    [[TMP11:%.*]] = bitcast i16 addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP11]], i32 1026
; OPT-NEXT:    store i8 [[TMP10]], i8 addrspace(5)* [[TMP12]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 2 %dst, i8 addrspace(5)* align 2 %src, i32 1027, i1 false)
  ret void
}

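; Variable-length copies expand to a main loop over 16-byte chunks plus a byte-wise residual loop guarded by a residual header.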
define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_variable(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 4
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 4
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 %n, i1 false)
  ret void
}

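; With align 2 the variable expansion copies i16 elements, so the residual loop moves at most one byte.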
define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 2
; OPT-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 2
; OPT-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load i16, i16 addrspace(1)* [[TMP7]], align 2
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP8]], i16 addrspace(1)* [[TMP9]], align 2
; OPT-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast i16 addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast i16 addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 2
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 2
; OPT-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 2 %dst, i8 addrspace(1)* align 2 %src, i64 %n, i1 false)
  ret void
}

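; align 1 variable copies still use <4 x i32> chunks, with align 1 accesses.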
define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align1_global_align1_variable(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT:    [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT:    [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT:    [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 1 %dst, i8 addrspace(1)* align 1 %src, i64 %n, i1 false)
  ret void
}

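; Variable-length copies in LDS use <2 x i32> chunks and an i32 count.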
define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(3)* [[DST:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT:    [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT:    [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(3)* [[TMP7]], align 4
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <2 x i32> [[TMP8]], <2 x i32> addrspace(3)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP2]] to i8 addrspace(3)*
; OPT-NEXT:    [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 4
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(3)* [[TMP17]], align 4
; OPT-NEXT:    [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 4 %dst, i8 addrspace(3)* align 4 %src, i32 %n, i1 false)
  ret void
}

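; align 2 LDS variable copy: i16 elements, udiv/urem by 2.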
define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to i16 addrspace(3)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(3)* [[DST:%.*]] to i16 addrspace(3)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i32 [[N:%.*]], 2
; OPT-NEXT:    [[TMP4:%.*]] = urem i32 [[N]], 2
; OPT-NEXT:    [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16 addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load i16, i16 addrspace(3)* [[TMP7]], align 2
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(3)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP8]], i16 addrspace(3)* [[TMP9]], align 2
; OPT-NEXT:    [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast i16 addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast i16 addrspace(3)* [[TMP2]] to i8 addrspace(3)*
; OPT-NEXT:    [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 2
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(3)* [[TMP17]], align 2
; OPT-NEXT:    [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 2 %dst, i8 addrspace(3)* align 2 %src, i32 %n, i1 false)
  ret void
}

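; align 1 LDS variable copy keeps the <2 x i32> chunking with align 1 accesses.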
define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align1_local_align1_variable(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(3)* [[DST:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT:    [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT:    [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(3)* [[TMP7]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <2 x i32> [[TMP8]], <2 x i32> addrspace(3)* [[TMP9]], align 1
; OPT-NEXT:    [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP2]] to i8 addrspace(3)*
; OPT-NEXT:    [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 1
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(3)* [[TMP17]], align 1
; OPT-NEXT:    [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 1 %dst, i8 addrspace(3)* align 1 %src, i32 %n, i1 false)
  ret void
}

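; Cross-address-space variable copy, global source into an LDS destination; each pointer keeps its own address space through the expansion.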
define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_global_align4_variable(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <2 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(3)* [[DST:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT:    [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT:    [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(1)* [[TMP7]], align 4
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <2 x i32> [[TMP8]], <2 x i32> addrspace(3)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP2]] to i8 addrspace(3)*
; OPT-NEXT:    [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 4
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(3)* [[TMP17]], align 4
; OPT-NEXT:    [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %dst, i8 addrspace(1)* align 4 %src, i32 %n, i1 false)
  ret void
}

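; The reverse direction: LDS source into a global destination.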
define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_local_align4_variable(
; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <2 x i32> addrspace(1)*
; OPT-NEXT:    [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT:    [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT:    [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(3)* [[TMP7]], align 4
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <2 x i32> [[TMP8]], <2 x i32> addrspace(1)* [[TMP9]], align 4
; OPT-NEXT:    [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT:    [[TMP13:%.*]] = bitcast <2 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT:    [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT:    [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 4
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT:    store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT:    [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT:    br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %dst, i8 addrspace(3)* align 4 %src, i32 %n, i1 false)
  ret void
}

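; Small static sizes: MAX1024 leaves these under-threshold copies as intrinsic calls, while ALL expands them. A 16-byte copy becomes a one-iteration <4 x i32> loop.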
define amdgpu_kernel void @memcpy_global_align4_global_align4_16(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_16(
; MAX1024-NEXT:    call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 16, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_16(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; ALL:       load-store-loop:
; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; ALL-NEXT:    [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 1
; ALL-NEXT:    br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL:       memcpy-split:
; ALL-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 16, i1 false)
  ret void
}

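; 12 bytes is not a multiple of 16, so it is emitted as straight-line i64 + i32 copies with no loop.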
define amdgpu_kernel void @memcpy_global_align4_global_align4_12(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_12(
; MAX1024-NEXT:    call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 12, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_12(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i64 addrspace(1)*
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT:    [[TMP3:%.*]] = load i64, i64 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT:    [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i64 addrspace(1)*
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT:    store i64 [[TMP3]], i64 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT:    [[TMP6:%.*]] = bitcast i8 addrspace(1)* [[SRC]] to i32 addrspace(1)*
; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i64 2
; ALL-NEXT:    [[TMP8:%.*]] = load i32, i32 addrspace(1)* [[TMP7]], align 4
; ALL-NEXT:    [[TMP9:%.*]] = bitcast i8 addrspace(1)* [[DST]] to i32 addrspace(1)*
; ALL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP9]], i64 2
; ALL-NEXT:    store i32 [[TMP8]], i32 addrspace(1)* [[TMP10]], align 4
; ALL-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 12, i1 false)
  ret void
}

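; 8 bytes: a single i64 load/store pair.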
define amdgpu_kernel void @memcpy_global_align4_global_align4_8(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_8(
; MAX1024-NEXT:    call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 8, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_8(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i64 addrspace(1)*
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT:    [[TMP3:%.*]] = load i64, i64 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT:    [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i64 addrspace(1)*
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT:    store i64 [[TMP3]], i64 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 8, i1 false)
  ret void
}

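; 10 bytes: an i64 pair plus an i16 tail.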
define amdgpu_kernel void @memcpy_global_align4_global_align4_10(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_10(
; MAX1024-NEXT:    call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 10, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_10(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i64 addrspace(1)*
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT:    [[TMP3:%.*]] = load i64, i64 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT:    [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i64 addrspace(1)*
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT:    store i64 [[TMP3]], i64 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT:    [[TMP6:%.*]] = bitcast i8 addrspace(1)* [[SRC]] to i16 addrspace(1)*
; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP6]], i64 4
; ALL-NEXT:    [[TMP8:%.*]] = load i16, i16 addrspace(1)* [[TMP7]], align 4
; ALL-NEXT:    [[TMP9:%.*]] = bitcast i8 addrspace(1)* [[DST]] to i16 addrspace(1)*
; ALL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP9]], i64 4
; ALL-NEXT:    store i16 [[TMP8]], i16 addrspace(1)* [[TMP10]], align 4
; ALL-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 10, i1 false)
  ret void
}

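; 4 bytes: a single i32.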
define amdgpu_kernel void @memcpy_global_align4_global_align4_4(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_4(
; MAX1024-NEXT:    call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 4, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_4(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i32 addrspace(1)*
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT:    [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i32 addrspace(1)*
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT:    store i32 [[TMP3]], i32 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 4, i1 false)
  ret void
}

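; 2 bytes: a single i16.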
define amdgpu_kernel void @memcpy_global_align4_global_align4_2(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_2(
; MAX1024-NEXT:    call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 2, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_2(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT:    [[TMP3:%.*]] = load i16, i16 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT:    [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT:    store i16 [[TMP3]], i16 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 2, i1 false)
  ret void
}

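; 1 byte: a single i8.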
define amdgpu_kernel void @memcpy_global_align4_global_align4_1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_1(
; MAX1024-NEXT:    call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 1, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_1(
; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 0
; ALL-NEXT:    [[TMP2:%.*]] = load i8, i8 addrspace(1)* [[TMP1]], align 4
; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 0
; ALL-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* [[TMP3]], align 4
; ALL-NEXT:    ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1, i1 false)
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind }
