1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_
17 #define TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_
18 
19 #include <memory>
20 #include <set>
21 #include <utility>
22 #include <vector>
23 
24 #include "absl/container/flat_hash_map.h"
25 #include "absl/container/flat_hash_set.h"
26 #include "tensorflow/compiler/xla/service/buffer_value.h"
27 #include "tensorflow/compiler/xla/service/buffer_value_containers.h"
28 #include "tensorflow/compiler/xla/service/hlo.pb.h"
29 #include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
30 #include "tensorflow/compiler/xla/service/hlo_buffer.h"
31 #include "tensorflow/compiler/xla/service/hlo_computation.h"
32 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
33 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
34 #include "tensorflow/compiler/xla/service/hlo_live_range.h"
35 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
36 #include "tensorflow/compiler/xla/service/hlo_schedule.h"
37 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
38 #include "tensorflow/compiler/xla/statusor.h"
39 
40 namespace xla {
41 
42 // Forward declare classes defined below.
43 template <typename BufferType>
44 class HeapAlgorithm;
45 template <typename BufferType>
46 class NoFragmentationStatsHeap;
47 
48 // HeapSimulator assigns buffer offsets by running a simulation of a regular
49 // memory heap with Alloc and Free calls.  It only works for completely
50 // sequential instruction sequences.  Unlike regular heaps, we have the
51 // advantage that the sequence of Alloc and Free calls is known up-front; we
52 // don't need to return the assignment of buffer offsets until the very end.
53 class HeapSimulator {
54  public:
55   // Chunk represents a contiguous piece of memory.  Each BufferValue will be
56   // associated with a chunk in the assignment result.
57   struct Chunk {
58     int64 offset;
59     int64 size;
60 
chunk_endChunk61     int64 chunk_end() const { return offset + size; }
62 
63     bool OverlapsWith(Chunk other_chunk) const;
64 
65     bool operator==(const Chunk& other) const {
66       return offset == other.offset && size == other.size;
67     }
68   };
69 
70   template <typename BufferType>
71   struct HeapResult {
72     // The assignment of buffers to chunks.
73     absl::flat_hash_map<const BufferType*, Chunk> chunk_map;
74 
75     // The total size in bytes of the heap, containing all assigned chunks.
76     int64 heap_size = 0;
77   };
78   // Result represents the result of the heap simulation.
79   template <typename BufferType>
80   struct Result {
81     // Heap results.
82     std::vector<HeapResult<BufferType>> heap_results;
83 
84     // The total size in bytes of the heaps.
85     // heap_size == sum([hr.heap_size for hr in heap_results]).
86     int64 heap_size = 0;
87 
88     // The total size in bytes of heap fragmentation.
89     int64 fragmentation_size = 0;
90 
91     // A trace of heap simulation events.
92     HeapSimulatorTrace debug_trace;
93   };
94 
95   // The different options to be passed to the Run() APIs.
96   struct Options {
OptionsOptions97     Options()
98         : may_reuse_operand_buffers(true),
99           alloc_constants(false),
100           buffers_to_assign(nullptr) {}
101 
102     // Whether a buffer about to be Free()-ed, can be recycled for a new born
103     // one, hence collapsing Free()+Alloc() calls (default true).
104     bool may_reuse_operand_buffers;
105     // Whether to issue Alloc() and Free() calls for constants (default false).
106     bool alloc_constants;
107     // If 'buffers_to_assign' is provided, only those buffers are assigned
108     // offsets, otherwise all buffers defined by the instructions are assigned.
109     const absl::flat_hash_set<const HloValue*>* buffers_to_assign;
110   };
111 
112   // Returns the minimum memory required to compute an HLO module where all
113   // computations have been scheduled (represented by the given
114   // schedule), assuming no fragmentation.
115   static StatusOr<int64> MinimumMemoryForModule(
116       const HloSchedule& schedule,
117       const LogicalBuffer::SizeFunction& size_function);
118 
119   // Returns the minimum memory required to compute the given computation,
120   // assuming no fragmentation.
121   static StatusOr<int64> MinimumMemoryForComputation(
122       const HloComputation& computation, const HloInstructionSequence& sequence,
123       const HloAliasAnalysis& alias_analysis,
124       const LogicalBuffer::SizeFunction& size_function,
125       const absl::flat_hash_map<const HloComputation*, int64>*
126           memory_by_computation = nullptr);
127 
128   static StatusOr<int64> MinimumMemoryForComputation(
129       const HloComputation& computation, const HloInstructionSequence& sequence,
130       const HloAliasAnalysis& alias_analysis,
131       const LogicalBuffer::SizeFunction& size_function,
132       const HloSchedule* schedule);
133 
134   // Run the heap simulation with the given algorithm, assuming the given
135   // schedule, which must contain a topologically-consistent total
136   // ordering of all instructions within each computation. The result is invalid
137   // if instructions are not run in exactly this sequence.
138   //
139   // Running heap simulation on the whole module tends to save memory, compared
140   // to running on a per-computation basis, since we can re-use buffer space for
141   // called sub-computations.
142   //
143   static StatusOr<Result<HloValue>> Run(
144       std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
145       const HloModule& module, const HloSchedule& schedule,
146       const HloAliasAnalysis& alias_analysis,
147       const BufferValue::SizeFunction& size_fn,
148       const Options& options = Options());
149 
150   // Same as above, but runs on a single computation. The 'instruction_sequence'
151   // must contain a topologically-consistent total ordering of all instructions
152   // in the computation. The result is invalid if instructions are not run in
153   // exactly this sequence.
154   static StatusOr<Result<HloValue>> Run(
155       std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
156       const HloComputation& computation,
157       const HloInstructionSequence& instruction_sequence,
158       const HloAliasAnalysis& alias_analysis,
159       const BufferValue::SizeFunction& size_fn,
160       const Options& options = Options(),
161       const absl::flat_hash_map<const HloComputation*, int64>*
162           memory_by_computation = nullptr);
163 
164   // Same as above, but runs on with a schedule that covers all nested
165   // computations.
166   static StatusOr<Result<HloValue>> Run(
167       std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
168       const HloComputation& computation,
169       const HloInstructionSequence& instruction_sequence,
170       const HloAliasAnalysis& alias_analysis,
171       const BufferValue::SizeFunction& size_fn, const HloSchedule* schedule,
172       const Options& options = Options());
173 
174  private:
175   // If 'schedule' is non-null, it is used to find kCall and kWhile
176   // sub-computations, and the heap simulation for those sub-computations will
177   // be run recursively. I.e. the simulation is run over the whole module.
178   HeapSimulator(std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
179                 const BufferValue::SizeFunction& size_fn,
180                 const Options& options, const HloSchedule* schedule = nullptr,
181                 const absl::flat_hash_map<const HloComputation*, int64>*
182                     memory_by_computation = nullptr);
183   ~HeapSimulator();
184 
185   Status RunComputation(const HloComputation& computation,
186                         const HloInstructionSequence& instruction_sequence,
187                         const HloAliasAnalysis& alias_analysis,
188                         HloLiveRange* live_range);
189 
190   bool IgnoreBuffer(const HloValue* buffer) const;
191   void Alloc(const HloValue* buffer, const HloInstruction* instruction);
192   void Free(const HloValue* buffer, const HloInstruction* instruction);
193   // ShareBuffer indicates that a new buffer is defined and it has to be the
194   // same address as the shared one.
195   void ShareBuffer(const HloValue* buffer, const HloValue* shared,
196                    const HloInstruction* instruction);
197 
198   // Returns true if:
199   //  Two buffers belong to the same shared group.
200   //  Eight of the buffer has no shared group assigned.
201   bool InSameSharedGroup(const HloValue* left, const HloValue* right);
202   Result<HloValue> Finish();
203 
204   void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
205                       const HloValue* buffer, const HloInstruction* instruction,
206                       const HloValue* share_with_canonical);
207 
208   // Counterintuitive: the algorithm_ itself can be a NoFragmentationStatsHeap,
209   // in which case we are calculating the same allocs/frees twice in the
210   // simulation.
211   const std::unique_ptr<NoFragmentationStatsHeap<HloValue>>
212       no_fragmentation_stats_;
213   const std::unique_ptr<HeapAlgorithm<HloValue>> algorithm_;
214   const BufferValue::SizeFunction size_fn_;
215   const Options options_;
216   // schedule_ is set by buffer assignment, and memory_by_computation_ is
217   // set by hlo scheduling. Then, in RunComputation, we check both in order to
218   // handle subcomputations. It would be good to unify the handling of
219   // subcomputations, but it's not clear how.
220   const HloSchedule* schedule_;
221   const absl::flat_hash_map<const HloComputation*, int64>*
222       memory_by_computation_;
223 
224   // Hold some sets for error-checking the sequence of Alloc and Free calls.
225   absl::flat_hash_set<const HloValue*> allocated_buffers_;
226   absl::flat_hash_set<const HloValue*> freed_buffers_;
227 
228   // Debugging information filled in while the heap simulator runs.
229   HeapSimulatorTrace debug_trace_;
230 };
231 
232 // Abstract base class describing a heap simulation algorithm that assigns
233 // offsets to buffers.  A sequence of Alloc / Free calls will be made, with the
234 // same semantics as a regular memory heap.  Finish will be called at the end to
235 // collect the simulation results.
236 template <typename BufferType>
237 class HeapAlgorithm {
238  public:
239   using Chunk = HeapSimulator::Chunk;
240   using Result = HeapSimulator::Result<BufferType>;
241   using HeapResult = HeapSimulator::HeapResult<BufferType>;
242 
243   virtual ~HeapAlgorithm() = default;
244 
245   // Alloc allocates a buffer of 'size' bytes.
246   virtual void Alloc(const BufferType* buffer, int64 size) = 0;
247 
248   // Takes memory usage of subcomputations into account when calculating the
249   // memory usage of a computation. Currently, we don't handle buffer aliasing
250   // between computations entirely correctly. We are careful to not double count
251   // for the output buffers of whiles/conds/calls. But we don't take into
252   // account other aliases, such as for the while init. A more thorough solution
253   // would require something like BufferAssignment::BuildColocatedBufferSets.
254   // TODO(b/65835246):
255   // Since TuplePointsToAnalysis is being replaced with a module-aware alias
256   // analysis, it's not worth making major changes to HeapSimulator now.
AccountForSubcomputationMemory(const HloInstruction * instruction,int64 alloc_size_by_instruction,const absl::flat_hash_map<const HloComputation *,int64> & memory_by_computation)257   virtual void AccountForSubcomputationMemory(
258       const HloInstruction* instruction,
259       // The total number of bytes allocated by instruction.
260       int64 alloc_size_by_instruction,
261       const absl::flat_hash_map<const HloComputation*, int64>&
262           memory_by_computation) {}
263 
264   // Free de-allocates a previously allocated buffer.
265   virtual void Free(const BufferType* buffer, int64 size) = 0;
266 
267   // Indicates that a buffer has to be collocated with another buffer. In
268   // addition to Alloc and Free, the heap simulator exposes a concept of buffer
269   // sharing.  When ShareBuffer is called, instead of allocating new space for
270   // the buffer, it associates the buffer with a previously allocated (or
271   // shared) buffer.  Each group of mutually-shared buffers points to a single
272   // SharedGroup instance, which is a shared control block.
ShareWith(const BufferType * buffer,const BufferType * share_with,int64 size)273   virtual void ShareWith(const BufferType* buffer, const BufferType* share_with,
274                          int64 size) {
275     Alloc(buffer, size);
276   }
277 
278   // Finish collects the buffer offset assignment results.  Finish may only be
279   // called once, after all Alloc and Free calls.
280   virtual Result Finish() = 0;
281 };
282 
283 // NoFragmentationStatsHeap computes the heap size assuming no fragmentation;
284 // this is the absolute minimum size for a given instruction sequence.  The
285 // result.chunk_map returned in Finish is always empty, since we only collect
286 // stats, and don't actually compute chunk assignments.
287 template <typename BufferType>
288 class NoFragmentationStatsHeap : public HeapAlgorithm<BufferType> {
289  public:
290   using Result = HeapSimulator::Result<BufferType>;
291 
292   NoFragmentationStatsHeap() = default;
293   ~NoFragmentationStatsHeap() override = default;
294 
295   void Alloc(const BufferType* buffer, int64 size) override;
296 
297   void AccountForSubcomputationMemory(
298       const HloInstruction* instruction, int64 alloc_size_by_instruction,
299       const absl::flat_hash_map<const HloComputation*, int64>&
300           memory_by_computation) override;
301 
302   void Free(const BufferType* buffer, int64 size) override;
303 
304   Result Finish() override;
305 
306  private:
307   int64 current_heap_size_ = 0;
308   int64 max_heap_size_ = 0;
309 };
310 
311 // Node in BufferIntervalTree that stores the alloc and free times of a buffer,
312 // and the chunk assigned to it.
313 struct BufferIntervalTreeNode {
314   // Alloc time.
315   int64 start;
316   // Free time.
317   int64 end;
318   // Maximum free time of all nodes in the subtree where this node is the root.
319   int64 subtree_end;
320   // Allocated chunk for the buffer.
321   HeapSimulator::Chunk chunk;
322   // Left child.
323   BufferIntervalTreeNode* left;
324   // Right child.
325   BufferIntervalTreeNode* right;
326   // parent
327   BufferIntervalTreeNode* parent;
328 };
329 
330 // An interval tree that can query buffers overlapping in time.
331 class BufferIntervalTree {
332  public:
333   using Chunk = HeapSimulator::Chunk;
334   // Adds a buffer to the interval tree, with the time interval and allocated
335   // chunk specified.
336   void Add(int64 start, int64 end, const Chunk& chunk);
337 
338   // Remove the interval from the tree. Returns true if the chunk is removed.
339   bool Remove(int64 start, int64 end, const Chunk& chunk);
340 
341   // Returns vector of allocated chunks that overlap with the given time
342   // interval.
343   std::vector<Chunk> ChunksOverlappingInTime(int64 start, int64 end) const;
344 
GetRoot()345   BufferIntervalTreeNode* GetRoot() { return root_; }
346 
347  private:
348   BufferIntervalTreeNode* root_ = nullptr;
349   std::list<BufferIntervalTreeNode> node_storage_;
350 };
351 
352 // GlobalDecreasingSizeBestFitHeap collects the live intervals of all buffers,
353 // then allocates them in decreasing spatial or temporal size regardless of the
354 // alloc/free time. It internally tracks the allocated buffers and their live
355 // intervals; when allocating a buffer, it finds the best-fit free chunk during
356 // its live interval.
357 template <typename BufferType>
358 class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
359  public:
360   using HeapResult = HeapSimulator::HeapResult<BufferType>;
361   using Result = HeapSimulator::Result<BufferType>;
362   using Chunk = HeapSimulator::Chunk;
363 
364   enum Type {
365     kSpatial = 0,
366     kTemporal,
367   };
368 
369   // BufferInterval stores a buffer's size and time interval.
370   struct BufferInterval {
371     const BufferType* buffer;
372     int64 size;
373     // Alloc time of the buffer.
374     int64 start;
375     // Free time of the buffer.
376     int64 end;
377 
378     // Colocation buffers that need to be collocated with this one.
379     std::vector<const BufferType*> colocations;
380 
381     // True if this buffer needs an allocation. False if it is collocated with
382     // other buffer.
383     bool need_allocation;
384   };
385 
386   // Comparison function that is used to store buffer intervals.
387   using BufferIntervalCompare =
388       std::function<bool(const BufferInterval&, const BufferInterval&)>;
389 
390   explicit GlobalDecreasingSizeBestFitHeap(int64 alignment,
391                                            Type type = kSpatial);
~GlobalDecreasingSizeBestFitHeap()392   ~GlobalDecreasingSizeBestFitHeap() override {}
393 
394   void Alloc(const BufferType* buffer, int64 size) override;
395   void Free(const BufferType* buffer, int64 size) override;
396 
397   void ShareWith(const BufferType* buffer, const BufferType* share_with,
398                  int64 size) override;
399 
400   Result Finish() override;
401 
402   // Return a BufferIntervalCompare function that sort by spatial size. We don't
403   // look at co-locates as they should have the same size.
404   static BufferIntervalCompare GetSpatialBufferIntervalCompare();
405 
406  protected:
407   // The candidate contains a chunk and the resultant heap size if this
408   // chunk is to be committed.
409   struct ChunkCandidate {
410     Chunk chunk;
411     int64 heap_size;
412   };
413 
414   // Returns the buffer intervals sorted according to buffer_interval_compare_.
415   std::vector<BufferInterval> GetSortedBufferIntervals() const;
416 
417   // These two methods below are exposed to other heap algorithms that inherit
418   // from this class. The Finish() method tries to find a candidate chunk for
419   // each BufferInterval, after calling GetSortedBufferIntervals. If a
420   // non-negative preferred_offset is provided, FindChunkCandidate attempts
421   // finding a chunk at this offset. The ChunkCandidate returns the chunk and
422   // the final heap size if it chunk is to be committed. The Finish() method can
423   // then call CommitChunk to associate the chunk with the BufferInterval, if
424   // the final heap size is within the limits.
425   ChunkCandidate FindChunkCandidate(const BufferInterval& buffer_interval,
426                                     int64 preferred_offset = -1) const;
427   void CommitChunk(const BufferInterval& buffer_interval,
428                    ChunkCandidate chunk_candidate);
429 
430   // Adds the buffer and the chunk to the result chunk map.
431   virtual void AddToChunkMap(const BufferType* buffer, Chunk chunk);
432 
433   // Return a BufferIntervalCompare function that sorts by live ranges.  A live
434   // range is defined by the range between the start of the first buffer and the
435   // end of the last co-located buffer.  There could be "holes" in the live
436   // ranges of each co-located buffers, but in this heuristics we think they are
437   // contiguous.
438   BufferIntervalCompare GetTemporalBufferIntervalCompare() const;
439 
440   absl::flat_hash_map<const BufferType*, BufferInterval> buffer_intervals_;
441   HeapResult result_;
442   BufferIntervalCompare buffer_interval_compare_;
443   BufferIntervalTree interval_tree_;
444 
445  private:
446   int64 alignment_;
447 
448   // The current time represented as an integer. It increments by 1 at each
449   // Alloc or Free call.
450   int64 current_time_ = 0;
451 
452   // Returns all transitive colocated buffers of this buffer interval. I.e., If
453   // a buffer A is colocated with B and B is colocated with C, this function
454   // returns all three of them.
455   absl::flat_hash_set<const BufferType*> GetTransitiveColocations(
456       const BufferInterval& interval) const;
457 };
458 
459 // This class implements an algorithm that will produce multiple heaps, where
460 // each heap size is constrained by a given limit. Note that the constraint is
461 // soft, meaning that a valid heap result is generated even if there are some
462 // buffer sizes larger than the given constraint size.
463 //
464 // Pseudocode:
465 //   while( `buffers` is not empty ) {
466 //     create a new heap `h`
467 //     for (each buffer `buf` in `buffers` in the size-decreasing order) {
468 //       if (buf.size() is larger than the heap size limit &&
469 //           `h` is empty) {
470 //         h.place(buf)
471 //         buffers.remove(buf)
472 //       } else if (placing `buf` into `h` does not violate size
473 //           constraint) {
474 //         h.place(buf)
475 //         buffers.remove(buf)
476 //       }
477 //     }
478 //   }
479 class ConstrainedGlobalDecreasingSizeBestFitHeap
480     : public GlobalDecreasingSizeBestFitHeap<HloValue> {
481  public:
482   explicit ConstrainedGlobalDecreasingSizeBestFitHeap(
483       uint64 size_limit_per_heap, int64 alignment, Type type = kSpatial)
484       : GlobalDecreasingSizeBestFitHeap<HloValue>(alignment, type),
485         size_limit_per_heap_(size_limit_per_heap) {}
~ConstrainedGlobalDecreasingSizeBestFitHeap()486   ~ConstrainedGlobalDecreasingSizeBestFitHeap() override {}
487 
488   Result Finish() override;
489 
490  private:
491   uint64 size_limit_per_heap_;
492 };
493 
494 // A heap algorithm that chooses the best results from other algorithms added to
495 // it.
496 template <typename BufferType>
497 class ChooseBestHeapAlgorithm : public HeapAlgorithm<BufferType> {
498  public:
499   using Result = HeapSimulator::Result<BufferType>;
500 
ChooseBestHeapAlgorithm(std::unique_ptr<std::vector<std::unique_ptr<HeapAlgorithm<BufferType>>>> algorithms)501   ChooseBestHeapAlgorithm(
502       std::unique_ptr<std::vector<std::unique_ptr<HeapAlgorithm<BufferType>>>>
503           algorithms)
504       : algorithms_(std::move(*algorithms)) {}
~ChooseBestHeapAlgorithm()505   ~ChooseBestHeapAlgorithm() override {}
506 
Alloc(const BufferType * buffer,int64 size)507   void Alloc(const BufferType* buffer, int64 size) override {
508     for (auto& algorithm : algorithms_) {
509       algorithm->Alloc(buffer, size);
510     }
511   }
512 
ShareWith(const BufferType * buffer,const BufferType * share_with,int64 size)513   void ShareWith(const BufferType* buffer, const BufferType* share_with,
514                  int64 size) override {
515     for (auto& algorithm : algorithms_) {
516       algorithm->ShareWith(buffer, share_with, size);
517     }
518   }
519 
Free(const BufferType * buffer,int64 size)520   void Free(const BufferType* buffer, int64 size) override {
521     for (auto& algorithm : algorithms_) {
522       algorithm->Free(buffer, size);
523     }
524   }
525 
526   Result Finish() override;
527 
528  private:
529   std::vector<std::unique_ptr<HeapAlgorithm<BufferType>>> algorithms_;
530 };
531 
532 }  // namespace xla
533 
534 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_
535