1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_FUSIBLE_H_ 17 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_FUSIBLE_H_ 18 19 #include "tensorflow/compiler/xla/service/hlo_instruction.h" 20 21 // TODO(b/112957171): Extract logic to determine fusibility of HLO ops from 22 // GpuInstructionFusion, FusionMerger, and GpuMultiOutputFusion. 23 24 namespace xla { 25 namespace gpu { 26 27 // Whether 'instr' can occur inside fusions, i.e. whether it is a candidate 28 // for being fused. Note that further restrictions apply, e.g. Scatter must 29 // be the root of an input fusion. 30 bool IsFusible(const HloInstruction& instr); 31 32 bool IsInputFusible(const HloInstruction& instr); 33 34 bool IsLoopFusible(const HloInstruction& instr); 35 36 // The code emitted for reduce-rooted input fusions (EmitReductionToVector) 37 // suffers from poor data locality if the layouts of input parameters differ. In 38 // such situtations it is better not to fuse. Only input params with 39 // maximum rank are considered. Params with smaller ranks will be broadcasted 40 // and have not been observed to cause data locality issues. 41 // TODO(b/111977086): Improve reduce emitters to remove this limitation. 42 bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer, 43 const HloInstruction& reduce); 44 45 // Note that reduction ops are lowered in different ways. Reduce input fusions 46 // are lowered by IrEmitterUnnested::EmitReductionToVector and must be rooted at 47 // reduction-to-vector ops. Other reduction ops are lowered by 48 // GpuElementalIrEmitter and fused like elementwise ops. 49 50 // Whether `instr` is an input fusion rooted at a reduction-to-vector op or a 51 // multi-output input fusion with at least one reduction-to-vector op root. 52 bool IsReduceInputFusion(const HloInstruction& instr); 53 54 // Whether `instr` is fusible as root of a reduce input fusions, i.e. `instr` 55 // is either an unfused reduction-to-vector op or a reduce input fusion. 56 bool IsInputFusibleReduction(const HloInstruction& instr); 57 58 // Whether `instr` is fusible as root of a scatter input fusions, i.e. `instr` 59 // is either an unfused scatter op or a scatter input fusion. 60 bool IsInputFusibleScatter(const HloInstruction& instr); 61 62 // Whether instruction shapes are compatible for multi-output fusion, i.e. 63 // whether the emitters support lowering the resulting fusion. 64 // This function works for both, sibling and producer-consumer multi-output 65 // fusion. 66 // So far, multi-output fusion is supported for loop fusions and reduce 67 // input fusions only. It is up to the caller to ensure the instructions 68 // themselves are fusible! 69 bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1, 70 const HloInstruction& instr2); 71 72 } // namespace gpu 73 } // namespace xla 74 75 #endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_FUSIBLE_H_ 76