1; RUN: llc -O3 -march=hexagon < %s | FileCheck %s
2
3; CHECK: loop0(.[[BLOCK:LBB0_[0-9]+]]
4; CHECK: .[[BLOCK]]:
5; CHECK: = vmemu({{r[0-9]+}}++#1)
6; CHECK: = vmemu({{r[0-9]+}}++#1)
7; CHECK: = vmemu({{r[0-9]+}}++#1)
8; CHECK: = vmemu({{r[0-9]+}}++#1)
9; CHECK: endloop0
10
11target triple = "hexagon-unknown--elf"
12
13%0 = type { i8*, i32, i32, i32, i32, %1*, %1*, %1* }
14%1 = type { %2 }
15%2 = type { i64 }
16%3 = type { i8*, i32, i32, i32, i32, i32, i32, i8*, i32, i32* }
17%4 = type { i64, i8*, [4 x i32], [4 x i32], [4 x i32], i32, i8, i8, [6 x i8] }
18
19@g0 = private unnamed_addr constant [5 x i8] c"Load\00", align 1
20@g1 = private unnamed_addr constant [6 x i8] c"Store\00", align 1
21@g2 = private unnamed_addr constant [18 x i8] c"Begin realization\00", align 1
22@g3 = private unnamed_addr constant [16 x i8] c"End realization\00", align 1
23@g4 = private unnamed_addr constant [8 x i8] c"Produce\00", align 1
24@g5 = private unnamed_addr constant [7 x i8] c"Update\00", align 1
25@g6 = private unnamed_addr constant [8 x i8] c"Consume\00", align 1
26@g7 = private unnamed_addr constant [12 x i8] c"End consume\00", align 1
27@g8 = private constant [6 x i8] c"input\00", align 32
28@g9 = private constant [10 x i8] c"dilate3x3\00", align 32
29@g10 = private constant [2 x %0] [%0 { i8* getelementptr inbounds ([6 x i8], [6 x i8]* @g8, i32 0, i32 0), i32 1, i32 2, i32 1, i32 8, %1* null, %1* null, %1* null }, %0 { i8* getelementptr inbounds ([10 x i8], [10 x i8]* @g9, i32 0, i32 0), i32 2, i32 2, i32 1, i32 8, %1* null, %1* null, %1* null }]
30@g11 = private constant [64 x i8] c"...............................................................\00", align 32
31
32; Function Attrs: nounwind
33declare i8* @f0(i8*, i32) #0
34
35; Function Attrs: nounwind
36declare void @f1(i8*, i8*) #0
37
38; Function Attrs: nounwind
39declare void @f2(i8*, i8*) #0
40
41; Function Attrs: nounwind
42declare i32 @f3(i8*, %3*) #0
43
44; Function Attrs: nounwind
45declare void @f4() #0
46
47; Function Attrs: nounwind
48declare void @f5() #0
49
50; Function Attrs: nounwind
; f6: 3x3 byte-wise maximum filter built from unaligned 128-byte vector loads.
;
; Each output vector is the vmaxub reduction of nine <32 x i32> loads taken
; from three consecutive rows (row-1, row, row+1, scaled by %v3) at byte
; offsets -1, 0 and +1 from the current column position. The nine loads use
; `align 1`, i.e. they are unaligned vector loads.
;
;   %a0  source descriptor; only read (pointer in field 1, plus fields 3 and 4).
;   %a1  destination descriptor; only its fields are read here, the result is
;        stored through the pointer loaded from field 1.
;   returns: always 0 (the only `ret` is in %b11).
;
; NOTE(review): %4 looks like a Halide buffer_t-style struct (host pointer in
; field 1, then extent[], stride[], min[] arrays in fields 2, 3, 4) -- the
; module flags and TBAA names mention Halide, but confirm against the producer.
;
; The FileCheck directives at the top of the file match labels of the form
; LBB0_N; presumably those belong to this function, the first one defined.
; Do not reorder or simplify the IR below without re-validating them.
51define i32 @f6(%4* noalias nocapture readonly %a0, %4* noalias nocapture readonly %a1) #0 {
52b0:
  ; Load everything needed from the two descriptors up front.
  ; From %a0: %v1 = byte pointer, %v3 = field3[1], %v5 = field4[0], %v7 = field4[1].
53  %v0 = getelementptr inbounds %4, %4* %a0, i32 0, i32 1
54  %v1 = load i8*, i8** %v0, align 4
55  %v2 = getelementptr inbounds %4, %4* %a0, i32 0, i32 3, i32 1
56  %v3 = load i32, i32* %v2, align 4
57  %v4 = getelementptr inbounds %4, %4* %a0, i32 0, i32 4, i32 0
58  %v5 = load i32, i32* %v4, align 4
59  %v6 = getelementptr inbounds %4, %4* %a0, i32 0, i32 4, i32 1
60  %v7 = load i32, i32* %v6, align 4
  ; From %a1: %v9 = byte pointer, %v11 = field2[0], %v13 = field3[1],
  ; %v15 = field4[0], %v17 = field4[1], %v19 = field2[1].
61  %v8 = getelementptr inbounds %4, %4* %a1, i32 0, i32 1
62  %v9 = load i8*, i8** %v8, align 4
63  %v10 = getelementptr inbounds %4, %4* %a1, i32 0, i32 2, i32 0
64  %v11 = load i32, i32* %v10, align 4
65  %v12 = getelementptr inbounds %4, %4* %a1, i32 0, i32 3, i32 1
66  %v13 = load i32, i32* %v12, align 4
67  %v14 = getelementptr inbounds %4, %4* %a1, i32 0, i32 4, i32 0
68  %v15 = load i32, i32* %v14, align 4
69  %v16 = getelementptr inbounds %4, %4* %a1, i32 0, i32 4, i32 1
70  %v17 = load i32, i32* %v16, align 4
71  %v18 = getelementptr inbounds %4, %4* %a1, i32 0, i32 2, i32 1
72  %v19 = load i32, i32* %v18, align 4
  ; %v20 = %v17 + %v19 is the exclusive end value of the outer (row) loops,
  ; which start at %v17. Skip all work when %v19 <= 0.
73  %v20 = add nsw i32 %v19, %v17
74  %v21 = icmp sgt i32 %v19, 0
75  br i1 %v21, label %b1, label %b11, !prof !3

77b1:                                               ; preds = %b0
  ; %v24 = max(%v11 >> 7, 0): the number of whole 128-byte steps in %v11.
  ; If there is at least one, take the %b5/%b2 path; otherwise go to %b7.
78  %v22 = ashr i32 %v11, 7
79  %v23 = icmp slt i32 %v22, 0
80  %v24 = select i1 %v23, i32 0, i32 %v22
81  %v25 = icmp sgt i32 %v24, 0
82  br i1 %v25, label %b5, label %b7, !prof !3

84b2:                                               ; preds = %b5, %b2
  ; Inner loop over whole vectors: %v26 counts 0 .. %v24-1, and the column
  ; position advances by 128 bytes per iteration (%v29 = %v26 << 7).
  ; %v150 is the current row, supplied by the outer-loop header %b5.
85  %v26 = phi i32 [ %v90, %b2 ], [ 0, %b5 ]
86  %v27 = mul nsw i32 %v7, %v3
87  %v28 = add nsw i32 %v27, %v5
88  %v29 = shl nsw i32 %v26, 7
89  %v30 = add nsw i32 %v29, %v15
  ; Row offsets for row-1, row and row+1, each scaled by %v3.
90  %v31 = add nsw i32 %v150, -1
91  %v32 = mul nsw i32 %v31, %v3
92  %v33 = mul nsw i32 %v150, %v3
93  %v34 = add nsw i32 %v150, 1
94  %v35 = mul nsw i32 %v34, %v3
  ; Row-1: loads at column offsets -1, 0, +1, reduced with vmaxub into %v50.
95  %v36 = sub i32 %v32, %v28
96  %v37 = add i32 %v36, %v30
97  %v38 = add nsw i32 %v37, -1
98  %v39 = getelementptr inbounds i8, i8* %v1, i32 %v38
99  %v40 = bitcast i8* %v39 to <32 x i32>*
100  %v41 = load <32 x i32>, <32 x i32>* %v40, align 1, !tbaa !4
101  %v42 = getelementptr inbounds i8, i8* %v1, i32 %v37
102  %v43 = bitcast i8* %v42 to <32 x i32>*
103  %v44 = load <32 x i32>, <32 x i32>* %v43, align 1, !tbaa !4
104  %v45 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v41, <32 x i32> %v44)
105  %v46 = add nsw i32 %v37, 1
106  %v47 = getelementptr inbounds i8, i8* %v1, i32 %v46
107  %v48 = bitcast i8* %v47 to <32 x i32>*
108  %v49 = load <32 x i32>, <32 x i32>* %v48, align 1, !tbaa !4
109  %v50 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v45, <32 x i32> %v49)
  ; Current row: same three column offsets, reduced into %v65, then merged
  ; with the row-1 result into %v66.
110  %v51 = sub i32 %v33, %v28
111  %v52 = add i32 %v51, %v30
112  %v53 = add nsw i32 %v52, -1
113  %v54 = getelementptr inbounds i8, i8* %v1, i32 %v53
114  %v55 = bitcast i8* %v54 to <32 x i32>*
115  %v56 = load <32 x i32>, <32 x i32>* %v55, align 1, !tbaa !4
116  %v57 = getelementptr inbounds i8, i8* %v1, i32 %v52
117  %v58 = bitcast i8* %v57 to <32 x i32>*
118  %v59 = load <32 x i32>, <32 x i32>* %v58, align 1, !tbaa !4
119  %v60 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v56, <32 x i32> %v59)
120  %v61 = add nsw i32 %v52, 1
121  %v62 = getelementptr inbounds i8, i8* %v1, i32 %v61
122  %v63 = bitcast i8* %v62 to <32 x i32>*
123  %v64 = load <32 x i32>, <32 x i32>* %v63, align 1, !tbaa !4
124  %v65 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v60, <32 x i32> %v64)
125  %v66 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v50, <32 x i32> %v65)
  ; Row+1: same three column offsets, reduced into %v81; %v82 is the final
  ; maximum over all nine loads.
126  %v67 = sub i32 %v35, %v28
127  %v68 = add i32 %v67, %v30
128  %v69 = add nsw i32 %v68, -1
129  %v70 = getelementptr inbounds i8, i8* %v1, i32 %v69
130  %v71 = bitcast i8* %v70 to <32 x i32>*
131  %v72 = load <32 x i32>, <32 x i32>* %v71, align 1, !tbaa !4
132  %v73 = getelementptr inbounds i8, i8* %v1, i32 %v68
133  %v74 = bitcast i8* %v73 to <32 x i32>*
134  %v75 = load <32 x i32>, <32 x i32>* %v74, align 1, !tbaa !4
135  %v76 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v72, <32 x i32> %v75)
136  %v77 = add nsw i32 %v68, 1
137  %v78 = getelementptr inbounds i8, i8* %v1, i32 %v77
138  %v79 = bitcast i8* %v78 to <32 x i32>*
139  %v80 = load <32 x i32>, <32 x i32>* %v79, align 1, !tbaa !4
140  %v81 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v76, <32 x i32> %v80)
141  %v82 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v66, <32 x i32> %v81)
  ; Destination offset: %v150*%v13 - (%v17*%v13 + %v15) + %v30, then an
  ; unaligned store of the result vector through %v9.
142  %v83 = mul nsw i32 %v150, %v13
143  %v84 = mul nsw i32 %v17, %v13
144  %v85 = add i32 %v84, %v15
145  %v86 = sub i32 %v83, %v85
146  %v87 = add i32 %v86, %v30
147  %v88 = getelementptr inbounds i8, i8* %v9, i32 %v87
148  %v89 = bitcast i8* %v88 to <32 x i32>*
149  store <32 x i32> %v82, <32 x i32>* %v89, align 1, !tbaa !7
150  %v90 = add nuw nsw i32 %v26, 1
151  %v91 = icmp eq i32 %v90, %v24
152  br i1 %v91, label %b6, label %b2

154b3:                                               ; preds = %b6, %b3
  ; Tail loop for the same row: %v92 counts %v24 .. %v152-1.
  ; None of the addresses computed in this block depend on %v92; they are
  ; derived from %v15 + %v11 with fixed offsets -129/-128/-127, so every
  ; iteration reads and writes the same final 128-byte window.
  ; Reuses %v28, %v32, %v33, %v35, %v83 and %v84 computed in %b2.
155  %v92 = phi i32 [ %v147, %b3 ], [ %v24, %b6 ]
156  %v93 = add nsw i32 %v15, %v11
157  %v94 = sub i32 %v93, %v28
  ; Row-1 (via %v32).
158  %v95 = add i32 %v94, %v32
159  %v96 = add nsw i32 %v95, -129
160  %v97 = getelementptr inbounds i8, i8* %v1, i32 %v96
161  %v98 = bitcast i8* %v97 to <32 x i32>*
162  %v99 = load <32 x i32>, <32 x i32>* %v98, align 1, !tbaa !4
163  %v100 = add nsw i32 %v95, -128
164  %v101 = getelementptr inbounds i8, i8* %v1, i32 %v100
165  %v102 = bitcast i8* %v101 to <32 x i32>*
166  %v103 = load <32 x i32>, <32 x i32>* %v102, align 1, !tbaa !4
167  %v104 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v99, <32 x i32> %v103)
168  %v105 = add nsw i32 %v95, -127
169  %v106 = getelementptr inbounds i8, i8* %v1, i32 %v105
170  %v107 = bitcast i8* %v106 to <32 x i32>*
171  %v108 = load <32 x i32>, <32 x i32>* %v107, align 1, !tbaa !4
172  %v109 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v104, <32 x i32> %v108)
  ; Current row (via %v33).
173  %v110 = add i32 %v94, %v33
174  %v111 = add nsw i32 %v110, -129
175  %v112 = getelementptr inbounds i8, i8* %v1, i32 %v111
176  %v113 = bitcast i8* %v112 to <32 x i32>*
177  %v114 = load <32 x i32>, <32 x i32>* %v113, align 1, !tbaa !4
178  %v115 = add nsw i32 %v110, -128
179  %v116 = getelementptr inbounds i8, i8* %v1, i32 %v115
180  %v117 = bitcast i8* %v116 to <32 x i32>*
181  %v118 = load <32 x i32>, <32 x i32>* %v117, align 1, !tbaa !4
182  %v119 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v114, <32 x i32> %v118)
183  %v120 = add nsw i32 %v110, -127
184  %v121 = getelementptr inbounds i8, i8* %v1, i32 %v120
185  %v122 = bitcast i8* %v121 to <32 x i32>*
186  %v123 = load <32 x i32>, <32 x i32>* %v122, align 1, !tbaa !4
187  %v124 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v119, <32 x i32> %v123)
188  %v125 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v109, <32 x i32> %v124)
  ; Row+1 (via %v35).
189  %v126 = add i32 %v94, %v35
190  %v127 = add nsw i32 %v126, -129
191  %v128 = getelementptr inbounds i8, i8* %v1, i32 %v127
192  %v129 = bitcast i8* %v128 to <32 x i32>*
193  %v130 = load <32 x i32>, <32 x i32>* %v129, align 1, !tbaa !4
194  %v131 = add nsw i32 %v126, -128
195  %v132 = getelementptr inbounds i8, i8* %v1, i32 %v131
196  %v133 = bitcast i8* %v132 to <32 x i32>*
197  %v134 = load <32 x i32>, <32 x i32>* %v133, align 1, !tbaa !4
198  %v135 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v130, <32 x i32> %v134)
199  %v136 = add nsw i32 %v126, -127
200  %v137 = getelementptr inbounds i8, i8* %v1, i32 %v136
201  %v138 = bitcast i8* %v137 to <32 x i32>*
202  %v139 = load <32 x i32>, <32 x i32>* %v138, align 1, !tbaa !4
203  %v140 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v135, <32 x i32> %v139)
204  %v141 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v125, <32 x i32> %v140)
  ; Destination offset: (%v11 - 128) - %v84 + %v83, i.e. the last 128 bytes
  ; of the row relative to %v11.
205  %v142 = add i32 %v11, -128
206  %v143 = sub i32 %v142, %v84
207  %v144 = add i32 %v143, %v83
208  %v145 = getelementptr inbounds i8, i8* %v9, i32 %v144
209  %v146 = bitcast i8* %v145 to <32 x i32>*
210  store <32 x i32> %v141, <32 x i32>* %v146, align 1, !tbaa !7
211  %v147 = add nuw nsw i32 %v92, 1
212  %v148 = icmp eq i32 %v147, %v152
213  br i1 %v148, label %b4, label %b3

215b4:                                               ; preds = %b6, %b3
  ; Outer-loop latch for the %b5 path: stop once the next row (%v34) reaches
  ; %v20, otherwise continue with the next row.
216  %v149 = icmp eq i32 %v34, %v20
217  br i1 %v149, label %b11, label %b5

219b5:                                               ; preds = %b4, %b1
  ; Outer-loop header: %v150 is the current row, starting at %v17.
220  %v150 = phi i32 [ %v34, %b4 ], [ %v17, %b1 ]
221  br label %b2

223b6:                                               ; preds = %b2
  ; After the whole-vector loop: %v152 = (%v11 + 127) >> 7 is %v11 rounded up
  ; to 128-byte steps. Run the tail loop %b3 only if that exceeds %v24.
224  %v151 = add nsw i32 %v11, 127
225  %v152 = ashr i32 %v151, 7
226  %v153 = icmp slt i32 %v24, %v152
227  br i1 %v153, label %b3, label %b4, !prof !3

229b7:                                               ; preds = %b1
  ; Path taken when %v24 <= 0 (no whole 128-byte step). Same round-up test
  ; as %b6; if it fails there is nothing to do.
230  %v154 = add nsw i32 %v11, 127
231  %v155 = ashr i32 %v154, 7
232  %v156 = icmp slt i32 %v24, %v155
233  br i1 %v156, label %b9, label %b11, !prof !3

235b8:                                               ; preds = %b9, %b8
  ; Tail-only inner loop: %v157 counts %v24 .. %v155-1, with %v223 as the
  ; current row. It performs the same computation as %b3 but recomputes the
  ; row terms locally. As in %b3, no address here depends on the loop
  ; counter %v157.
236  %v157 = phi i32 [ %v221, %b8 ], [ %v24, %b9 ]
237  %v158 = mul nsw i32 %v7, %v3
238  %v159 = add nsw i32 %v158, %v5
239  %v160 = add nsw i32 %v15, %v11
240  %v161 = add nsw i32 %v223, -1
241  %v162 = mul nsw i32 %v161, %v3
242  %v163 = mul nsw i32 %v223, %v3
243  %v164 = add nsw i32 %v223, 1
244  %v165 = mul nsw i32 %v164, %v3
245  %v166 = sub i32 %v160, %v159
  ; Row-1 (via %v162).
246  %v167 = add i32 %v166, %v162
247  %v168 = add nsw i32 %v167, -129
248  %v169 = getelementptr inbounds i8, i8* %v1, i32 %v168
249  %v170 = bitcast i8* %v169 to <32 x i32>*
250  %v171 = load <32 x i32>, <32 x i32>* %v170, align 1, !tbaa !4
251  %v172 = add nsw i32 %v167, -128
252  %v173 = getelementptr inbounds i8, i8* %v1, i32 %v172
253  %v174 = bitcast i8* %v173 to <32 x i32>*
254  %v175 = load <32 x i32>, <32 x i32>* %v174, align 1, !tbaa !4
255  %v176 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v171, <32 x i32> %v175)
256  %v177 = add nsw i32 %v167, -127
257  %v178 = getelementptr inbounds i8, i8* %v1, i32 %v177
258  %v179 = bitcast i8* %v178 to <32 x i32>*
259  %v180 = load <32 x i32>, <32 x i32>* %v179, align 1, !tbaa !4
260  %v181 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v176, <32 x i32> %v180)
  ; Current row (via %v163).
261  %v182 = add i32 %v166, %v163
262  %v183 = add nsw i32 %v182, -129
263  %v184 = getelementptr inbounds i8, i8* %v1, i32 %v183
264  %v185 = bitcast i8* %v184 to <32 x i32>*
265  %v186 = load <32 x i32>, <32 x i32>* %v185, align 1, !tbaa !4
266  %v187 = add nsw i32 %v182, -128
267  %v188 = getelementptr inbounds i8, i8* %v1, i32 %v187
268  %v189 = bitcast i8* %v188 to <32 x i32>*
269  %v190 = load <32 x i32>, <32 x i32>* %v189, align 1, !tbaa !4
270  %v191 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v186, <32 x i32> %v190)
271  %v192 = add nsw i32 %v182, -127
272  %v193 = getelementptr inbounds i8, i8* %v1, i32 %v192
273  %v194 = bitcast i8* %v193 to <32 x i32>*
274  %v195 = load <32 x i32>, <32 x i32>* %v194, align 1, !tbaa !4
275  %v196 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v191, <32 x i32> %v195)
276  %v197 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v181, <32 x i32> %v196)
  ; Row+1 (via %v165).
277  %v198 = add i32 %v166, %v165
278  %v199 = add nsw i32 %v198, -129
279  %v200 = getelementptr inbounds i8, i8* %v1, i32 %v199
280  %v201 = bitcast i8* %v200 to <32 x i32>*
281  %v202 = load <32 x i32>, <32 x i32>* %v201, align 1, !tbaa !4
282  %v203 = add nsw i32 %v198, -128
283  %v204 = getelementptr inbounds i8, i8* %v1, i32 %v203
284  %v205 = bitcast i8* %v204 to <32 x i32>*
285  %v206 = load <32 x i32>, <32 x i32>* %v205, align 1, !tbaa !4
286  %v207 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v202, <32 x i32> %v206)
287  %v208 = add nsw i32 %v198, -127
288  %v209 = getelementptr inbounds i8, i8* %v1, i32 %v208
289  %v210 = bitcast i8* %v209 to <32 x i32>*
290  %v211 = load <32 x i32>, <32 x i32>* %v210, align 1, !tbaa !4
291  %v212 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v207, <32 x i32> %v211)
292  %v213 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v197, <32 x i32> %v212)
  ; Destination offset: (%v11 - 128) - %v17*%v13 + %v223*%v13.
293  %v214 = mul nsw i32 %v223, %v13
294  %v215 = mul nsw i32 %v17, %v13
295  %v216 = add i32 %v11, -128
296  %v217 = sub i32 %v216, %v215
297  %v218 = add i32 %v217, %v214
298  %v219 = getelementptr inbounds i8, i8* %v9, i32 %v218
299  %v220 = bitcast i8* %v219 to <32 x i32>*
300  store <32 x i32> %v213, <32 x i32>* %v220, align 1, !tbaa !7
301  %v221 = add nuw nsw i32 %v157, 1
302  %v222 = icmp eq i32 %v221, %v155
303  br i1 %v222, label %b10, label %b8

305b9:                                               ; preds = %b10, %b7
  ; Outer-loop header for the tail-only path: %v223 is the current row,
  ; starting at %v17.
306  %v223 = phi i32 [ %v164, %b10 ], [ %v17, %b7 ]
307  br label %b8

309b10:                                              ; preds = %b8
  ; Outer-loop latch: stop once the next row (%v164) reaches %v20.
310  %v224 = icmp eq i32 %v164, %v20
311  br i1 %v224, label %b11, label %b9

313b11:                                              ; preds = %b10, %b7, %b4, %b0
  ; Common exit: every path returns 0.
314  ret i32 0
315}
316
317; Function Attrs: nounwind readnone
318declare <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32>, <32 x i32>) #1
319
320; Function Attrs: nounwind
; f7: thin wrapper around @f6. Forwards %a0 and %a1 unchanged, discards the
; i32 result of the call (%v0 is never used), and always returns 0.
321define i32 @f7(%4* noalias nocapture readonly %a0, %4* noalias nocapture readonly %a1) #0 {
322b0:
323  %v0 = tail call i32 @f6(%4* %a0, %4* %a1) #0
324  ret i32 0
325}
326
327; Function Attrs: nounwind
; f8: argument-unpacking wrapper around @f7.
;
;   %a0  pointer to an array of i8* values; only elements 0 and 1 are read.
;   returns: always 0; the result of @f7 (%v5) is ignored.
;
; Each of the two array slots is reinterpreted as holding a %4* and loaded,
; and the two pointers are passed to @f7 in the same order.
328define i32 @f8(i8** nocapture readonly %a0) #0 {
329b0:
  ; Slot 0 -> first argument.
330  %v0 = bitcast i8** %a0 to %4**
331  %v1 = load %4*, %4** %v0, align 4
  ; Slot 1 -> second argument.
332  %v2 = getelementptr i8*, i8** %a0, i32 1
333  %v3 = bitcast i8** %v2 to %4**
334  %v4 = load %4*, %4** %v3, align 4
335  %v5 = tail call i32 @f7(%4* %v1, %4* %v4)
336  ret i32 0
337}
338
339attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvxv60,+hvx-length128b" }
340attributes #1 = { nounwind readnone }
341
342!llvm.module.flags = !{!0, !1, !2}
343
344!0 = !{i32 2, !"halide_use_soft_float_abi", i32 0}
345!1 = !{i32 2, !"halide_mcpu", !"hexagonv60"}
346!2 = !{i32 2, !"halide_mattrs", !"+hvxv60,+hvx-length64b"}
347!3 = !{!"branch_weights", i32 1073741824, i32 0}
348!4 = !{!5, !5, i64 0}
349!5 = !{!"input", !6}
350!6 = !{!"Halide buffer"}
351!7 = !{!8, !8, i64 0}
352!8 = !{!"dilate3x3", !6}
353