1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_FUSIBLE_H_
17 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_FUSIBLE_H_
18 
19 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
20 
21 // TODO(b/112957171): Extract logic to determine fusibility of HLO ops from
22 // GpuInstructionFusion, FusionMerger, and GpuMultiOutputFusion.
23 
24 namespace xla {
25 namespace gpu {
26 
// Maximum number of operands and outputs allowed per fusion (per the name).
// NOTE(review): presumably the "allowed" limit that FusionWouldBeTooLarge
// checks against -- confirm against the definition in the .cc file.
27 constexpr int64 kMaxOperandsAndOutputsPerFusion = 64;
28 
// NOTE(review): undocumented in this header. Presumably reports whether
// `instr` can serve as the root of an input fusion (cf.
// IsInputFusibleReduction and IsInputFusibleScatter) -- confirm against the
// definition.
29 bool IsInputFusible(const HloInstruction& instr);
30 
// NOTE(review): undocumented in this header. Presumably reports whether
// `instr` can take part in a loop fusion -- confirm against the definition.
31 bool IsLoopFusible(const HloInstruction& instr);
32 
33 // The code emitted for reduce-rooted input fusions (EmitReductionToVector)
34 // suffers from poor data locality if the layouts of input parameters differ. In
35 // such situations it is better not to fuse. Only input params with
36 // maximum rank are considered. Params with smaller ranks will be broadcast
37 // and have not been observed to cause data locality issues.
38 // TODO(b/111977086): Improve reduce emitters to remove this limitation.
// NOTE(review): presumably returns true when fusing `producer` with `reduce`
// keeps the layouts of the maximum-rank input params consistent -- confirm
// against the definition.
39 bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
40                                          const HloInstruction& reduce);
41 
42 // Note that reduction ops are lowered in different ways. Reduce input fusions
43 // are lowered by IrEmitterUnnested::EmitReductionToVector and must be rooted at
44 // reduction-to-vector ops. Other reduction ops are lowered by
45 // GpuElementalIrEmitter and fused like elementwise ops.
46 
47 // Returns whether `instr` is an input fusion rooted at a reduction-to-vector
48 // op or a multi-output input fusion with at least one such op as a root.
49 bool IsReduceInputFusion(const HloInstruction& instr);
50 
51 // Whether `instr` is fusible as root of a reduce input fusion, i.e. `instr`
52 // is either an unfused reduction-to-vector op or a reduce input fusion.
53 bool IsInputFusibleReduction(const HloInstruction& instr);
54 
55 // Whether `instr` is fusible as root of a scatter input fusion, i.e. `instr`
56 // is either an unfused scatter op or a scatter input fusion.
57 bool IsInputFusibleScatter(const HloInstruction& instr);
58 
59 // Determines whether the combination of `instr1` and `instr2` into a (possibly
60 // multi-output) fusion would be "too large" -- i.e., have more operands and
61 // outputs than is allowed or occupy too much shared memory.
62 // If the fusion is a producer/consumer fusion and `instr1` is the
63 // consumer and `instr2` is the producer, set `is_consumer_producer_fusion`
64 // to true to enable more fusion.
65 bool FusionWouldBeTooLarge(const HloInstruction& instr1,
66                            const HloInstruction& instr2,
67                            bool is_consumer_producer_fusion = false);
68 
69 // Checks whether fusing `producer` and `consumer` would generate a nested
70 // loop, e.g. when both are `reduce-window` HLO instructions.
71 bool CreatesNestedLoop(const HloInstruction& producer,
72                        const HloInstruction& consumer);
73 
74 // Returns the instruction that determines which emitter is used for lowering,
75 // sometimes referred to as "the real hero".
// NOTE(review): nullability and lifetime of the returned pointer are not
// stated in this header -- confirm against the definition.
76 const HloInstruction* GetRealHeroForMultiOutputFusion(
77     const HloInstruction& instr);
78 
79 // Whether instruction shapes are compatible for multi-output fusion, i.e.
80 // whether the emitters support lowering the resulting fusion.
81 // This function works for both sibling and producer-consumer multi-output
82 // fusion.
83 // So far, multi-output fusion is supported for loop fusions and reduce
84 // input fusions only. It is up to the caller to ensure the instructions
85 // themselves are fusible!
86 bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1,
87                                           const HloInstruction& instr2);
88 
89 // Whether the instructions are compatible for producer-consumer fusion,
90 // i.e. whether the producer and consumer are loop/input fusible and
91 // they are not library calls.
92 bool IsProducerConsumerFusible(const HloInstruction& producer,
93                                const HloInstruction& consumer);
94 
95 // Whether the instructions are producer-consumer fusible with multiple outputs.
96 // That is, the root tuple of the multi-output fusion will contain the results
97 // of both the producer and the consumer.
98 bool IsProducerConsumerMultiOutputFusible(const HloInstruction& producer,
99                                           const HloInstruction& consumer);
100 // Whether `instr` is a candidate for sibling fusion or for use as a consumer
101 // in a producer-consumer multi-output fusion.
102 bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr);
103 
104 // Determines the fusion kind to be used when fusing `producer` and `consumer`.
// NOTE(review): the selection criteria are not stated in this header -- see
// the definition before relying on a particular kind being chosen.
105 HloInstruction::FusionKind ChooseFusionKind(const HloInstruction& producer,
106                                             const HloInstruction& consumer);
107 
108 // Returns whether `consumer` is the only non-root user of `instr`.
// NOTE(review): "non-root" presumably excludes a user that is the root
// instruction of the enclosing computation -- confirm against the definition.
109 bool IsConsumerTheOnlyNonRootUser(const HloInstruction& instr,
110                                   const HloInstruction& consumer);
111 
112 }  // namespace gpu
113 }  // namespace xla
114 
115 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_FUSIBLE_H_
116