1// RUN: mlir-opt %s -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=0 skip-non-unit-stride-loops" | FileCheck %s
2// Small buffer size to trigger fine copies.
3// RUN: mlir-opt %s -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=0 fast-mem-capacity=1" | FileCheck --check-prefix=CHECK-SMALL %s
4
5// Test affine data copy with a memref filter. We use a test pass that invokes
6// the affine data copy utility on the input loop nest.
7// The 'memref-filter' option of -test-affine-data-copy passes the first memref
8// found in an affine.load op in the innermost loop as a filter.
9// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter' | FileCheck %s --check-prefix=FILTER
10// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='for-memref-region' | FileCheck %s --check-prefix=MEMREF_REGION
11
12// The 'skip-non-unit-stride-loops' option forces the copies to be placed right
13// inside the tile space loops, avoiding the sensitivity of copy placement depth
14// to memory footprint -- so that one could write a definite test case and not
15// have to update it each time something related to the cost functions changes.
16
// Bound maps for the intra-tile loops of @matmul: #id yields the enclosing tile
// loop's induction variable and #ub yields that value plus 128.
17#id = affine_map<(d0) -> (d0)>
18#ub = affine_map<(d0) -> (d0 + 128)>
19
20// Map used to index the buffer while computing.
21// CHECK-DAG: [[$MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)>
22// CHECK-DAG: [[$MAP_PLUS_128:map[0-9]+]] = affine_map<(d0) -> (d0 + 128)>
23
24// CHECK-LABEL: func @matmul
25// FILTER-LABEL: func @matmul
// Test input: a 4096x4096 f32 matrix multiply-accumulate into %C, written as a
// six-deep affine loop nest. The three outer loops step by 128; the three inner
// loops run from #id(iv) to #ub(iv), i.e. over [iv, iv + 128).
26func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> {
27  affine.for %i = 0 to 4096 step 128 {
28    affine.for %j = 0 to 4096 step 128 {
29      affine.for %k = 0 to 4096 step 128 {
        // Intra-tile loops: each covers 128 consecutive iterations starting at
        // the induction variable of the corresponding outer loop.
30        affine.for %ii = #id(%i) to #ub(%i) {
31          affine.for %jj = #id(%j) to #ub(%j) {
32            affine.for %kk = #id(%k) to #ub(%k) {
              // C[ii, jj] = C[ii, jj] + A[ii, kk] * B[kk, jj]
33              %5 = affine.load %A[%ii, %kk] : memref<4096x4096xf32>
34              %6 = affine.load %B[%kk, %jj] : memref<4096x4096xf32>
35              %7 = affine.load %C[%ii, %jj] : memref<4096x4096xf32>
36              %8 = mulf %5, %6 : f32
37              %9 = addf %7, %8 : f32
38              affine.store %9, %C[%ii, %jj] : memref<4096x4096xf32>
39            }
40          }
41        }
42      }
43    }
44  }
45  return %C : memref<4096x4096xf32>
46}
47
48// Buffers of size 128x128 get created here for all three matrices.
49
50// CHECK: affine.for %[[I:.*]] = 0 to 4096 step 128 {
51// CHECK:   affine.for %[[J:.*]] = 0 to 4096 step 128 {
52// CHECK:     [[BUFC:%[0-9]+]] = alloc() : memref<128x128xf32>
53// The result matrix's copy gets hoisted out.
54// Result matrix copy-in.
55// CHECK:     affine.for %[[II:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
56// CHECK:       affine.for %[[JJ:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
57// CHECK:         affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
58// CHECK:         affine.store %{{.*}}, [[BUFC]][-%[[I]] + %[[II]], -%[[J]] + %[[JJ]]] : memref<128x128xf32>
59// CHECK:       }
60// CHECK:     }
61
62// LHS matrix copy-in.
63// CHECK:     affine.for %[[K:.*]] = 0 to 4096 step 128 {
64// CHECK:      [[BUFA:%[0-9]+]] = alloc() : memref<128x128xf32>
65// CHECK:       affine.for %[[II:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
66// CHECK:         affine.for %[[KK:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
67// CHECK:           affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
68// CHECK:           affine.store %{{.*}}, [[BUFA]][-%[[I]] + %[[II]], -%[[K]] + %[[KK]]] : memref<128x128xf32>
69// CHECK:         }
70// CHECK:       }
71
72// RHS matrix copy-in.
73// CHECK:       [[BUFB:%[0-9]+]] = alloc() : memref<128x128xf32>
74// CHECK:       affine.for %[[KK:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
75// CHECK:         affine.for %[[JJ:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
76// CHECK:           affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
77// CHECK:           affine.store %{{.*}}, [[BUFB]][-%[[K]] + %[[KK]], -%[[J]] + %[[JJ]]] : memref<128x128xf32>
78// CHECK:         }
79// CHECK:       }
80
81// Computation on the fast buffers.
82// CHECK:       affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
83// CHECK:         affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
84// CHECK:           affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
85// CHECK:             affine.load [[BUFA]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
86// CHECK:             affine.load [[BUFB]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
87// CHECK:             affine.load [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
88// CHECK:             mulf %{{.*}}, %{{.*}} : f32
89// CHECK:             addf %{{.*}}, %{{.*}} : f32
90// CHECK:             affine.store %{{.*}}, [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
91// CHECK:           }
92// CHECK:         }
93// CHECK:       }
94// CHECK:       dealloc [[BUFB]] : memref<128x128xf32>
95// CHECK:       dealloc [[BUFA]] : memref<128x128xf32>
96// CHECK:     }
97
98// Result matrix copy out.
99// CHECK:     affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
100// CHECK:       affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
101// CHECK:         affine.load [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
102// CHECK:         store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
103// CHECK:       }
104// CHECK:     }
105// CHECK:     dealloc [[BUFC]] : memref<128x128xf32>
106// CHECK:   }
107// CHECK: }
108
109// Check that only one memref is copied when memref filter is used.
110
111//      FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
112//      FILTER:   alloc() : memref<128x4096xf32>
113//  FILTER-NOT:   alloc()
114//      FILTER:   affine.for
115//      FILTER:     affine.for %{{.*}} = 0 to 4096 {
116//      FILTER:   affine.for %{{.*}} = 0 to 4096 step 128 {
117// FILTER-NEXT:     affine.for %{{.*}} = 0 to 4096 step 128 {
118// FILTER-NEXT:       affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
119// FILTER-NEXT:         affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
120// FILTER-NEXT:           affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
121//      FILTER:   dealloc %{{.*}} : memref<128x4096xf32>
122//  FILTER-NOT:   dealloc %{{.*}} : memref<128x4096xf32>
123
124// -----
125
126//
127// This test case will lead to single element buffers. These are eventually
128// expected to be turned into registers via alloca and mem2reg.
129//
130// CHECK-SMALL-LABEL: func @single_elt_buffers
131// FILTER-LABEL: func @single_elt_buffers
132// MEMREF_REGION-LABEL: func @single_elt_buffers
// Test input: a three-deep unit-stride loop nest (0 to 1024 in each dimension)
// that accumulates %arg1[k, j] into %arg2[i, j]. %arg0 is never accessed.
133func @single_elt_buffers(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
134  affine.for %i = 0 to 1024 {
135    affine.for %j = 0 to 1024 {
      // The %arg2 subscripts do not involve %k, so every iteration of this
      // loop reads and writes the same %arg2 element.
136      affine.for %k = 0 to 1024 {
137        %6 = affine.load %arg1[%k, %j] : memref<1024x1024xf32>
138        %7 = affine.load %arg2[%i, %j] : memref<1024x1024xf32>
139        %9 = addf %6, %7 : f32
140        affine.store %9, %arg2[%i, %j] : memref<1024x1024xf32>
141      }
142    }
143  }
144  return %arg2 : memref<1024x1024xf32>
145}
146// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
147// CHECK-SMALL:   affine.for %arg{{.*}} = 0 to 1024 {
148// CHECK-SMALL:     alloc() : memref<1x1xf32>
149// CHECK-SMALL:     affine.load %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
150// CHECK-SMALL:     affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
151// CHECK-SMALL:     affine.for %arg{{.*}} = 0 to 1024 {
152// CHECK-SMALL:       alloc() : memref<1x1xf32>
153// CHECK-SMALL:       affine.load %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
154// CHECK-SMALL:       affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
155// CHECK-SMALL:       affine.load %{{.*}}[0, 0] : memref<1x1xf32>
156// CHECK-SMALL:       affine.load %{{.*}}[0, 0] : memref<1x1xf32>
157// CHECK-SMALL:       addf %{{.*}}, %{{.*}} : f32
158// CHECK-SMALL:       affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
159// CHECK-SMALL:       dealloc %{{.*}} : memref<1x1xf32>
160// CHECK-SMALL:     }
161// CHECK-SMALL:     affine.load %{{.*}}[0, 0] : memref<1x1xf32>
162// CHECK-SMALL:     affine.store %{{.*}}, %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
163// CHECK-SMALL:     dealloc %{{.*}} : memref<1x1xf32>
164// CHECK-SMALL:   }
165// CHECK-SMALL: }
166// CHECK-SMALL: return
167
168// Check that only one memref is copied when memref filter is used.
169
170//      FILTER: alloc() : memref<1024x1024xf32>
171//  FILTER-NOT: alloc()
172//      FILTER: affine.for %{{.*}} = 0 to 1024 {
173//      FILTER:   affine.for %{{.*}} = 0 to 1024 {
174//      FILTER: affine.for %{{.*}} = 0 to 1024 {
175// FILTER-NEXT:   affine.for %{{.*}} = 0 to 1024 {
176// FILTER-NEXT:     affine.for %{{.*}} = 0 to 1024 {
177//      FILTER: dealloc %{{.*}} : memref<1024x1024xf32>
178//  FILTER-NOT: dealloc
179//  FILTER:     return
180
181// Check that only one memref is copied, because for-memref-region is enabled
182// (and the first ever encountered load is analyzed).
183//      MEMREF_REGION: alloc() : memref<1024x1024xf32>
184//  MEMREF_REGION-NOT: alloc()
185//      MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
186//      MEMREF_REGION:   affine.for %{{.*}} = 0 to 1024 {
187//      MEMREF_REGION:   }
188//      MEMREF_REGION: }
189// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
190// MEMREF_REGION-NEXT:   affine.for %{{.*}} = 0 to 1024 {
191// MEMREF_REGION-NEXT:     affine.for %{{.*}} = 0 to 1024 {
192//      MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32>
193// MEMREF_REGION-NOT: dealloc
194// MEMREF_REGION-NEXT: return
195
196// -----
197
198// This pattern typically appears with tiling with tile sizes that don't divide
199// the loop trip counts.
200
// Two-result map; @min_upper_bound applies it under `min`, giving the upper
// bound min(4096, d0 + 100).
201#map_ub = affine_map<(d0) -> (4096, d0 + 100)>
202
203// CHECK-DAG: [[$MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)>
204// CHECK-DAG: [[$MAP_MIN_UB1:map[0-9]+]] = affine_map<(d0) -> (d0 + 100, 4096)>
205// CHECK-DAG: [[$MAP_MIN_UB2:map[0-9]+]] = affine_map<(d0) -> (4096, d0 + 100)>
206
207// CHECK-LABEL: func @min_upper_bound
// Test input: a loop over a 4096-element memref tiled by 100. Since 100 does
// not divide 4096, the intra-tile loop's upper bound is min(4096, %i + 100).
208func @min_upper_bound(%A: memref<4096xf32>) -> memref<4096xf32> {
209  affine.for %i = 0 to 4096 step 100 {
210    affine.for %ii = affine_map<(d0) -> (d0)>(%i) to min #map_ub(%i) {
      // In-place square: A[ii] = A[ii] * A[ii].
211      %5 = affine.load %A[%ii] : memref<4096xf32>
212      %6 = mulf %5, %5 : f32
213      affine.store %6, %A[%ii] : memref<4096xf32>
214    }
215  }
216  return %A : memref<4096xf32>
217}
218// CHECK:      affine.for %[[IV1:.*]] = 0 to 4096 step 100
219// CHECK:        %[[BUF:.*]] = alloc() : memref<100xf32>
220// CHECK-NEXT:   affine.for %[[IV2:.*]] = #[[$MAP_IDENTITY]](%[[IV1]]) to min #[[$MAP_MIN_UB1]](%[[IV1]]) {
221// CHECK-NEXT:     affine.load %{{.*}}[%[[IV2]]] : memref<4096xf32>
222// CHECK-NEXT:     affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
223// CHECK-NEXT:   }
224// CHECK-NEXT:   affine.for %[[IV2:.*]] = #[[$MAP_IDENTITY]](%[[IV1]]) to min #[[$MAP_MIN_UB2]](%[[IV1]]) {
225// CHECK-NEXT:     affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
226// CHECK-NEXT:     mulf
227// CHECK-NEXT:     affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
228// CHECK-NEXT:   }
229// CHECK:        affine.for %[[IV2:.*]] = #[[$MAP_IDENTITY]](%[[IV1]]) to min #[[$MAP_MIN_UB1]](%[[IV1]]) {
230// CHECK-NEXT:     affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
231// CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%[[IV2]]] : memref<4096xf32>
232// CHECK-NEXT:   }
233// CHECK-NEXT:   dealloc %[[BUF]] : memref<100xf32>
234// CHECK-NEXT: }
235
236// -----
237
238// Lower bound is a max; upper bound is a min. This pattern typically appears
239// with multi-level tiling when the tile sizes used don't divide loop trip
240// counts.
241
// Two-result, symbol-only maps (no dimension operands). @max_lower_bound uses
// #lb under `max` as the lower bound and #ub under `min` as the upper bound.
242#lb = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)>
243#ub = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)>
244
245// CHECK-DAG: #[[$LB:.*]] = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)>
246// CHECK-DAG: #[[$UB:.*]] = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)>
247
248// CHECK-LABEL: max_lower_bound(%{{.*}}: memref<2048x516xf64>,
249// CHECK-SAME: [[i:arg[0-9]+]]
250// CHECK-SAME: [[j:arg[0-9]+]]
// Test input: the inner loop runs from max(%i * 512, %j * 6) to
// min(%i * 512 + 512, %j * 6 + 6), with %i and %j bound as symbols s0 and s1.
// The body only loads from %M; the loaded value is unused and nothing is stored.
251func @max_lower_bound(%M: memref<2048x516xf64>, %i : index, %j : index) {
252  affine.for %ii = 0 to 2048 {
253    affine.for %jj = max #lb()[%i, %j] to min #ub()[%i, %j] {
254      affine.load %M[%ii, %jj] : memref<2048x516xf64>
255    }
256  }
257  return
258}
259
260// CHECK:      %[[BUF:.*]] = alloc() : memref<2048x6xf64>
261// CHECK-NEXT: affine.for %[[ii:.*]] = 0 to 2048 {
262// CHECK-NEXT:   affine.for %[[jj:.*]] = max #[[$LB]]()[%[[i]], %[[j]]] to min #[[$UB]]()[%[[i]], %[[j]]] {
263// CHECK-NEXT:      affine.load %{{.*}}[%[[ii]], %[[jj]]] : memref<2048x516xf64>
264// CHECK-NEXT:      affine.store %{{.*}}, %[[BUF]][%[[ii]], %[[jj]] - symbol(%[[j]]) * 6] : memref<2048x6xf64>
265// CHECK-NEXT:   }
266// CHECK-NEXT: }
267// CHECK-NEXT: affine.for %[[ii_:.*]] = 0 to 2048 {
268// CHECK-NEXT:   affine.for %[[jj_:.*]] = max #[[$LB]]()[%{{.*}}, %{{.*}}] to min #[[$UB]]()[%{{.*}}, %{{.*}}] {
269// CHECK-NEXT:     affine.load %[[BUF]][%[[ii_]], %[[jj_]] - symbol(%[[j]]) * 6] : memref<2048x6xf64>
270// CHECK-NEXT:    }
271// CHECK-NEXT: }
272// CHECK-NEXT: dealloc %[[BUF]] : memref<2048x6xf64>
273