// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-pipeline-data-transfer | FileCheck %s

// -----

// CHECK-DAG: [[$MOD_2:#map[0-9]+]] = affine_map<(d0) -> (d0 mod 2)>
// CHECK-DAG: [[$MAP_MINUS_1:#map[0-9]+]] = affine_map<(d0) -> (d0 - 1)>

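// The pass double-buffers the faster memory space buffer and the DMA tag by
// adding a leading dimension of two indexed by the loop IV mod 2: iteration
// 0's DMA is issued as a prologue before the loop, the wait and the compute
// are shifted back by one iteration inside it, and the last outstanding
// transfer is drained in an epilogue after it.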
// CHECK-LABEL: func @loop_nest_dma() {
func @loop_nest_dma() {

  %A = alloc() : memref<256 x f32, affine_map<(d0) -> (d0)>, 0>
  %Ah = alloc() : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>

  %tag = alloc() : memref<1 x f32>

  %zero = constant 0 : index
  %num_elts = constant 32 : index

  affine.for %i = 0 to 8 {
    affine.dma_start %A[%i], %Ah[%i], %tag[%zero], %num_elts : memref<256 x f32>, memref<32 x f32, 1>, memref<1 x f32>
    affine.dma_wait %tag[%zero], %num_elts : memref<1 x f32>
    %v = affine.load %Ah[%i] : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
    %r = "compute"(%v) : (f32) -> (f32)
    affine.store %r, %Ah[%i] : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
    affine.for %j = 0 to 32 {
      "do_more_compute"(%i, %j) : (index, index) -> ()
    }
  }
  dealloc %tag : memref<1 x f32>
  dealloc %Ah : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
  return
}
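// Prologue: the double buffers are allocated and iteration 0's DMA is started
// ahead of the loop.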
// CHECK:       %{{.*}} = alloc() : memref<256xf32>
// CHECK:       %{{.*}} = alloc() : memref<2x32xf32, 1>
// CHECK-NEXT:  %{{.*}} = alloc() : memref<2x1xf32>
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}} mod 2, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
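// Steady state: each iteration starts the next DMA, then waits on and
// computes over the previous iteration's data.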
// CHECK-NEXT:  affine.for %{{.*}} = 1 to 8 {
// CHECK-NEXT:    affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}} mod 2, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
// CHECK-NEXT:    affine.apply [[$MAP_MINUS_1]](%{{.*}})
// CHECK-NEXT:    affine.apply [[$MOD_2]](%{{.*}})
// CHECK-NEXT:    affine.apply [[$MOD_2]](%{{.*}})
// CHECK-NEXT:    affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xf32>
// CHECK-NEXT:    affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
// CHECK-NEXT:    "compute"(%{{.*}}) : (f32) -> f32
// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
// CHECK-NEXT:    affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT:      "do_more_compute"(%{{.*}}, %{{.*}}) : (index, index) -> ()
// CHECK-NEXT:    }
// CHECK-NEXT:  }
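// Epilogue: wait on and compute over the last transfer.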
// CHECK-NEXT:  affine.apply [[$MAP_MINUS_1]](%{{.*}})
// CHECK-NEXT:  affine.apply [[$MOD_2]](%{{.*}})
// CHECK-NEXT:  affine.apply [[$MOD_2]](%{{.*}})
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xf32>
// CHECK-NEXT:  affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
// CHECK-NEXT:  "compute"(%{{.*}}) : (f32) -> f32
// CHECK-NEXT:  affine.store %{{.*}}, %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
// CHECK-NEXT:  affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT:    "do_more_compute"(%{{.*}}, %{{.*}}) : (index, index) -> ()
// CHECK-NEXT:  }
// CHECK-NEXT:  dealloc %{{.*}} : memref<2x1xf32>
// CHECK-NEXT:  dealloc %{{.*}} : memref<2x32xf32, 1>
// CHECK-NEXT:  return
// CHECK-NEXT:}

// -----

// CHECK-DAG: [[$FLOOR_MOD_2:#map[0-9]+]] = affine_map<(d0) -> ((d0 floordiv 4) mod 2)>
// CHECK-DAG: [[$REMAP_SHIFT_MINUS_4:#map[0-9]+]] = affine_map<(d0) -> (d0 - 4)>

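// With a non-unit step, the double-buffer index is the normalized induction
// value modulo 2, i.e. (d0 floordiv 4) mod 2 for step 4.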
// CHECK-LABEL: @loop_step
func @loop_step(%arg0: memref<512xf32>,
                  %arg1: memref<512xf32>) {
  %c0 = constant 0 : index
  %c4 = constant 4 : index
  affine.for %i0 = 0 to 512 step 4 {
    %1 = alloc() : memref<4xf32, 1>
    %2 = alloc() : memref<1xi32>
    affine.dma_start %arg0[%i0], %1[%c0], %2[%c0], %c4
              : memref<512xf32>, memref<4xf32, 1>, memref<1xi32>
    affine.dma_wait %2[%c0], %c4 : memref<1xi32>
    "compute"(%i0) : (index) -> ()
    dealloc %2 : memref<1xi32>
    dealloc %1 : memref<4xf32, 1>
  }
  return
}
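// Prologue DMA into the double buffer, followed by the shifted steady-state
// loop starting at the second iteration.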
// CHECK:        [[BUF:%[0-9]+]] = alloc() : memref<2x4xf32, 1>
// CHECK:        [[TAG:%[0-9]+]] = alloc() : memref<2x1xi32>
// CHECK-NEXT:   affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[(%{{.*}} floordiv 4) mod 2, 0], [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
// CHECK-NEXT:   affine.for %{{.*}} = 4 to 512 step 4 {
// CHECK-NEXT:     affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[(%{{.*}} floordiv 4) mod 2, 0], [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
// CHECK-NEXT:     affine.apply [[$REMAP_SHIFT_MINUS_4]](%{{.*}})
// CHECK-NEXT:     affine.apply [[$FLOOR_MOD_2]](%{{.*}})
// CHECK:          affine.dma_wait [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<2x1xi32>
// CHECK-NEXT:     "compute"(%{{.*}}) : (index) -> ()
// CHECK-NEXT:   }
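// Epilogue: the last iteration's IV is recomputed via the d0 - 4 map to index
// the double buffer for the final wait and compute.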
// CHECK-NEXT:   [[SHIFTED:%[0-9]+]] = affine.apply [[$REMAP_SHIFT_MINUS_4]](%{{.*}})
// CHECK-NEXT:   %{{.*}} = affine.apply [[$FLOOR_MOD_2]]([[SHIFTED]])
// CHECK:        affine.dma_wait [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<2x1xi32>
// CHECK-NEXT:   "compute"(%{{.*}}) : (index) -> ()
// CHECK-NEXT:   dealloc [[TAG]] : memref<2x1xi32>
// CHECK-NEXT:   dealloc [[BUF]] : memref<2x4xf32, 1>
// CHECK-NEXT:   return
// CHECK-NEXT: }

// -----

#map1 = affine_map<(d0, d1) -> ((d0 * 2048 + d1 * 256) floordiv 32)>
#map2 = affine_map<(d0) -> ((d0 * 2048) floordiv 32)>
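// Pipelining applies at each loop depth independently: the inner loop's
// prologue, steady state, and epilogue are replicated inside both the outer
// loop's steady state and its epilogue.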
// CHECK-LABEL: func @loop_dma_nested(%{{.*}}: memref<512x32xvector<8xf32>
func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>>, %arg1: memref<512x32xvector<8xf32>>, %arg2: memref<512x32xvector<8xf32>>) {
  %num_elts = constant 256 : index
  %c0 = constant 0 : index
  %0 = alloc() : memref<64x4xvector<8xf32>, 2>
  %1 = alloc() : memref<64x4xvector<8xf32>, 2>
  %2 = alloc() : memref<64x4xvector<8xf32>, 2>
  %3 = alloc() : memref<2xi32>
  %4 = alloc() : memref<2xi32>
  %5 = alloc() : memref<2xi32>
  // Prologue for DMA overlap on arg2.
  // CHECK-DAG: [[BUF_ARG2:%[0-9]+]] = alloc() : memref<2x64x4xvector<8xf32>, 2>
  // CHECK-DAG: [[TAG_ARG2:%[0-9]+]] = alloc() : memref<2x2xi32>
  // CHECK: affine.dma_start %{{.*}}[
  // CHECK: affine.for %{{.*}} = 1 to 8 {
  affine.for %i0 = 0 to 8 {
    %6 = affine.apply #map2(%i0)
    affine.dma_start %arg2[%6, %c0], %2[%c0, %c0], %5[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
    affine.dma_wait %5[%c0], %num_elts : memref<2xi32>
    // Steady state for DMA overlap on arg2.
    // CHECK: affine.dma_start %{{.*}}[
    // CHECK: affine.dma_wait [[TAG_ARG2]]
    // Prologue for DMA overlap on arg0, arg1 nested within i0.
    // CHECK: [[BUF_ARG0:%[0-9]+]] = alloc() : memref<2x64x4xvector<8xf32>, 2>
    // CHECK: [[BUF_ARG1:%[0-9]+]] = alloc() : memref<2x64x4xvector<8xf32>, 2>
    // CHECK: [[TAG_ARG0:%[0-9]+]] = alloc() : memref<2x2xi32>
    // CHECK: [[TAG_ARG1:%[0-9]+]] = alloc() : memref<2x2xi32>
    // CHECK: affine.dma_start %{{.*}}[
    // CHECK: affine.dma_start %{{.*}}[
    // CHECK-NEXT: affine.for %{{.*}} = 1 to 8 {
    affine.for %i1 = 0 to 8 {
      %7 = affine.apply #map1(%i0, %i1)
      %8 = affine.apply #map2(%i1)
      affine.dma_start %arg0[%7, %c0], %0[%c0, %c0], %3[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
      affine.dma_start %arg1[%8, %c0], %1[%c0, %c0], %4[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
      affine.dma_wait %3[%c0], %num_elts : memref<2xi32>
      affine.dma_wait %4[%c0], %num_elts : memref<2xi32>
      // Steady state for DMA overlap on arg0, arg1.
      // CHECK: affine.dma_start %{{.*}}[
      // CHECK: affine.dma_start %{{.*}}[
      // CHECK: affine.dma_wait [[TAG_ARG0]]
      // CHECK: affine.dma_wait [[TAG_ARG1]]
      // CHECK-NEXT: affine.for %{{.*}} = 0 to 4 {
      affine.for %i2 = 0 to 4 {
        "foo"() : () -> ()
      }
    }
    // Epilogue for arg0, arg1.
    // CHECK: affine.dma_wait [[TAG_ARG0]]
    // CHECK: affine.dma_wait [[TAG_ARG1]]
    // CHECK-DAG:    dealloc [[TAG_ARG1]] : memref<2x2xi32>
    // CHECK-DAG:    dealloc [[TAG_ARG0]] : memref<2x2xi32>
    // CHECK-DAG:    dealloc [[BUF_ARG1]] : memref<2x64x4xvector<8xf32>, 2>
    // CHECK-DAG:    dealloc [[BUF_ARG0]] : memref<2x64x4xvector<8xf32>, 2>
  // Epilogue for DMA overlap on %arg2.
  // CHECK:  affine.dma_wait [[TAG_ARG2]]
  // Within the epilogue for arg2's DMA, we have the DMAs on %arg0, %arg1 nested.
  // CHECK: [[BUF_ARG0_NESTED:%[0-9]+]] = alloc() : memref<2x64x4xvector<8xf32>, 2>
  // CHECK: [[BUF_ARG1_NESTED:%[0-9]+]] = alloc() : memref<2x64x4xvector<8xf32>, 2>
  // CHECK: [[TAG_ARG0_NESTED:%[0-9]+]] = alloc() : memref<2x2xi32>
  // CHECK: [[TAG_ARG1_NESTED:%[0-9]+]] = alloc() : memref<2x2xi32>
  // CHECK:  affine.dma_start %{{.*}}[
  // CHECK:  affine.dma_start %{{.*}}[
  // CHECK:  affine.for %{{.*}} = 1 to 8 {
  // CHECK:    affine.dma_start %{{.*}}[
  // CHECK:    affine.dma_start %{{.*}}[
  // CHECK:    affine.dma_wait [[TAG_ARG0_NESTED]]
  // CHECK:    affine.dma_wait [[TAG_ARG1_NESTED]]
  // CHECK:    affine.for %{{.*}} = 0 to 4 {
  // CHECK:      "foo"() : () -> ()
  // CHECK:  affine.dma_wait [[TAG_ARG0_NESTED]]
  // CHECK:  affine.dma_wait [[TAG_ARG1_NESTED]]
  // CHECK:  affine.for %{{.*}} = 0 to 4 {
  }
  dealloc %5 : memref<2xi32>
  dealloc %4 : memref<2xi32>
  dealloc %3 : memref<2xi32>
  dealloc %2 : memref<64x4xvector<8xf32>, 2>
  dealloc %1 : memref<64x4xvector<8xf32>, 2>
  dealloc %0 : memref<64x4xvector<8xf32>, 2>
  return
// CHECK: }
// CHECK-DAG:  dealloc [[TAG_ARG1_NESTED]] : memref<2x2xi32>
// CHECK-DAG:  dealloc [[TAG_ARG0_NESTED]] : memref<2x2xi32>
// CHECK-DAG:  dealloc [[BUF_ARG1_NESTED]] : memref<2x64x4xvector<8xf32>, 2>
// CHECK-DAG:  dealloc [[BUF_ARG0_NESTED]] : memref<2x64x4xvector<8xf32>, 2>
// CHECK-DAG:  dealloc [[TAG_ARG2]] : memref<2x2xi32>
// CHECK-DAG:  dealloc [[BUF_ARG2]] : memref<2x64x4xvector<8xf32>, 2>
// CHECK-NEXT: return
}

// -----
#map2 = affine_map<(d0) -> ((d0 * 2048) floordiv 32)>

// CHECK-LABEL: func @loop_dma_dependent
func @loop_dma_dependent(%arg2: memref<512x32xvector<8xf32>>) {
  %num_elts = constant 256 : index
  %c0 = constant 0 : index
  %0 = alloc() : memref<64x4xvector<8xf32>, 2>
  %1 = alloc() : memref<64x4xvector<8xf32>, 2>
  %2 = alloc() : memref<64x4xvector<8xf32>, 2>
  %3 = alloc() : memref<2xi32>
  %4 = alloc() : memref<2xi32>
  %5 = alloc() : memref<2xi32>

  // The two DMAs below are dependent (incoming and outgoing on the same
  // memref) in the same iteration, so no pipelining happens here.
  // CHECK-NOT: affine.dma_start
  // CHECK: affine.for %{{.*}} = 0 to 8 {
  affine.for %i0 = 0 to 8 {
    %6 = affine.apply #map2(%i0)
    affine.dma_start %arg2[%6, %c0], %2[%c0, %c0], %5[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
    affine.dma_wait %5[%c0], %num_elts : memref<2xi32>

    affine.dma_start %2[%c0, %c0], %arg2[%6, %c0], %5[%c0], %num_elts : memref<64x4xvector<8xf32>, 2>, memref<512x32xvector<8xf32>>, memref<2xi32>
    affine.dma_wait %5[%c0], %num_elts : memref<2xi32>
  }
  dealloc %5 : memref<2xi32>
  dealloc %4 : memref<2xi32>
  dealloc %3 : memref<2xi32>
  dealloc %2 : memref<64x4xvector<8xf32>, 2>
  dealloc %1 : memref<64x4xvector<8xf32>, 2>
  dealloc %0 : memref<64x4xvector<8xf32>, 2>
  return
}

// -----

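// %Av is passed whole to an opaque op, so it cannot be double-buffered; the
// CHECK-NOT below verifies that no DMA is hoisted out as a prologue.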
// CHECK-LABEL: func @escaping_use
func @escaping_use(%arg0: memref<512 x 32 x f32>) {
  %c32 = constant 32 : index
  %num_elt = constant 512 : index
  %zero = constant 0 : index
  %Av = alloc() : memref<32 x 32 x f32, 2>
  %tag = alloc() : memref<1 x i32>

  // CHECK-NOT: affine.dma_start
  // CHECK: affine.for %{{.*}} = 0 to 16 {
  affine.for %kTT = 0 to 16 {
    affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
      memref<512 x 32 x f32>,
      memref<32 x 32 x f32, 2>, memref<1 x i32>
    affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
    // Escaping use; no DMA pipelining / double buffering will be done.
    "foo"(%Av) : (memref<32 x 32 x f32, 2>) -> ()
  }
  dealloc %tag : memref<1 x i32>
  dealloc %Av : memref<32 x 32 x f32, 2>
  return
// CHECK:        "foo"(%{{[0-9]+}}) : (memref<32x32xf32, 2>) -> ()
// CHECK:      }
// CHECK:      return
}

// -----

// CHECK-LABEL: func @escaping_tag
func @escaping_tag(%arg0: memref<512 x 32 x f32>) {
  %c32 = constant 32 : index
  %num_elt = constant 512 : index
  %zero = constant 0 : index
  %Av = alloc() : memref<32 x 32 x f32, 2>
  %tag = alloc() : memref<1 x i32>

  // CHECK-NOT: affine.dma_start
  // CHECK: affine.for %{{.*}} = 0 to 16 {
  affine.for %kTT = 0 to 16 {
    affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
      memref<512 x 32 x f32>,
      memref<32 x 32 x f32, 2>, memref<1 x i32>
    affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
    // Escaping use of the tag; no DMA pipelining / double buffering will be done.
    "foo"(%tag) : (memref<1 x i32>) -> ()
  }
  dealloc %tag : memref<1 x i32>
  dealloc %Av : memref<32 x 32 x f32, 2>
  return
// CHECK:        "foo"(%{{[0-9]+}}) : (memref<1xi32>) -> ()
// CHECK:      }
// CHECK:      return
}

// -----

// CHECK-LABEL: func @live_out_use
func @live_out_use(%arg0: memref<512 x 32 x f32>) -> f32 {
  %c32 = constant 32 : index
  %num_elt = constant 512 : index
  %zero = constant 0 : index
  %Av = alloc() : memref<32 x 32 x f32, 2>
  %tag = alloc() : memref<1 x i32>

  // CHECK-NOT: affine.dma_start
  // CHECK: affine.for %{{.*}} = 0 to 16 {
  affine.for %kTT = 0 to 16 {
    affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
      memref<512 x 32 x f32>,
      memref<32 x 32 x f32, 2>, memref<1 x i32>
    affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
  }
  // Use live out of 'affine.for' op; no DMA pipelining will be done.
  %v = affine.load %Av[%zero, %zero] : memref<32 x 32 x f32, 2>
  dealloc %tag : memref<1 x i32>
  dealloc %Av : memref<32 x 32 x f32, 2>
  return %v : f32
// CHECK:      affine.load %{{[0-9]+}}[%{{.*}}, %{{.*}}] : memref<32x32xf32, 2>
// CHECK:      return
}

// -----

// CHECK-LABEL: func @dynamic_shape_dma_buffer
func @dynamic_shape_dma_buffer(%arg0: memref<512 x 32 x f32>) {
  %c32 = constant 32 : index
  %num_elt = constant 512 : index
  %zero = constant 0 : index

  %Av = alloc(%c32, %c32) : memref<? x ? x f32, 2>
  %tag = alloc() : memref<1 x i32>

// Double buffering for a dynamically shaped buffer: the pass recovers the
// dynamic extents with 'dim' ops in order to size the new 2x?x? buffer.
// CHECK:       alloc(%{{.*}}, %{{.*}}) : memref<?x?xf32, 2>
// CHECK-NEXT:  %[[C0:.*]] = constant 0 : index
// CHECK-NEXT:  dim %{{.*}}, %[[C0]] : memref<?x?xf32, 2>
// CHECK-NEXT:  %[[C1:.*]] = constant 1 : index
// CHECK-NEXT:  dim %{{.*}}, %[[C1]] : memref<?x?xf32, 2>
// CHECK-NEXT:  alloc(%{{.*}}, %{{.*}}) : memref<2x?x?xf32, 2>
// CHECK:       affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0, 0], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}}
  affine.for %kTT = 0 to 16 {
    affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
      memref<512 x 32 x f32>,
      memref<? x ? x f32, 2>, memref<1 x i32>
    affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
  }
  dealloc %Av : memref<? x ? x f32, 2>
  return
// CHECK-NEXT:  affine.for %{{.*}} = 1 to 16 {
// CHECK:         affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0, 0], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}}
// CHECK:         affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xi32>
// CHECK:       }
// CHECK:       affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xi32>
// CHECK:       return
}

// -----

// Memref replacement will fail here due to the non-dereferencing use of %Ah.
// Even though another use is a dereferencing one, no incorrect transformation
// is performed: replaceAllMemRefUsesWith checks for escaping uses before
// performing any replacement.
// CHECK-LABEL: func @escaping_and_indexed_use_mix
func @escaping_and_indexed_use_mix() {
  %A = alloc() : memref<256 x f32, affine_map<(d0) -> (d0)>, 0>
  %Ah = alloc() : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
  %tag = alloc() : memref<1 x f32>
  %zero = constant 0 : index
  %num_elts = constant 32 : index

  // An alloc for the double buffer is created, but no replacement should happen.
  affine.for %i = 0 to 8 {
    affine.dma_start %A[%i], %Ah[%i], %tag[%zero], %num_elts : memref<256 x f32>, memref<32 x f32, 1>, memref<1 x f32>
    affine.dma_wait %tag[%zero], %num_elts : memref<1 x f32>
    "compute"(%Ah) : (memref<32 x f32, 1>) -> ()
    %v = affine.load %Ah[%i] : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
    "foo"(%v) : (f32) -> ()
  }
  dealloc %A : memref<256 x f32, affine_map<(d0) -> (d0)>, 0>
  dealloc %Ah : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
  return
}
// No replacement.
// CHECK: affine.for %{{.*}} = 0 to 8 {
// CHECK-NEXT:   affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}
// CHECK-NEXT:   affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xf32>
// CHECK-NEXT:   "compute"(%{{.*}}) : (memref<32xf32, 1>) -> ()
// CHECK-NEXT:   [[VAL:%[0-9]+]] = affine.load %{{.*}}[%{{.*}}] : memref<32xf32, 1>
// CHECK-NEXT:   "foo"([[VAL]]) : (f32) -> ()