/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/math_ops.cc.

#ifdef INTEL_MKL
#define EIGEN_USE_THREADS

#include <numeric>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/logging.h"

#include "mkldnn.hpp"
#include "tensorflow/core/util/mkl_util.h"
using mkldnn::stream;
using mkldnn::sum;

namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;

// MKL-DNN implementation of AddN restricted to exactly two input tensors.
// Inputs may arrive in MKL layout, TensorFlow layout, or a mix of both; a
// reorder is inserted for the TF-format input in the mixed case so the sum
// primitive sees two identically laid-out operands.
template <typename Device, typename T>
class MklAddNOp : public OpKernel {
 public:
  ~MklAddNOp() {}
  explicit MklAddNOp(OpKernelConstruction* context) : OpKernel(context) {}

  // Computes output = src1 + src2 (unit coefficients) via the MKL-DNN sum
  // primitive, with scalar and empty-tensor fast paths handled directly.
  void Compute(OpKernelContext* ctx) override {
    const int num = ctx->num_inputs();
    // Only additions of 2 input tensors is supported now. Each logical MKL
    // input carries a companion meta tensor, hence num / 2 data inputs.
    OP_REQUIRES(ctx, num / 2 == 2,
                errors::InvalidArgument("Only additions of two tensors "
                                        "supported by MKL. Num inputs: ",
                                        num));

    try {
      auto cpu_engine = engine(engine::cpu, 0);
      size_t src1_idx = 0, src2_idx = 1, output_idx = 0;
      const Tensor& src1_tensor = MklGetInput(ctx, src1_idx);
      const Tensor& src2_tensor = MklGetInput(ctx, src2_idx);

      MklDnnShape src1_mkl_shape, src2_mkl_shape;
      GetMklShape(ctx, src1_idx, &src1_mkl_shape);
      GetMklShape(ctx, src2_idx, &src2_mkl_shape);
      bool input1_in_mkl_format = src1_mkl_shape.IsMklTensor();
      bool input2_in_mkl_format = src2_mkl_shape.IsMklTensor();
      int src1_dims_size = input1_in_mkl_format ? src1_mkl_shape.GetDimension()
                                                : src1_tensor.dims();
      int src2_dims_size = input2_in_mkl_format ? src2_mkl_shape.GetDimension()
                                                : src2_tensor.dims();
      // if the shapes of two tensors are not same raise op error
      TensorShape src1_shape, src2_shape;
      src1_shape = input1_in_mkl_format ? src1_mkl_shape.GetTfShape()
                                        : src1_tensor.shape();
      src2_shape = input2_in_mkl_format ? src2_mkl_shape.GetTfShape()
                                        : src2_tensor.shape();

      if (!src1_shape.IsSameSize(src2_shape)) {
        ctx->SetStatus(errors::InvalidArgument(
            "Inputs to operation ", this->name(), " of type ",
            this->type_string(),
            " must have the same size and shape. Input 0: ",
            src1_shape.DebugString(),
            " != input 1: ", src2_shape.DebugString()));
        // BUG FIX: bail out once the error status is set. Previously
        // execution fell through and ran the MKL sum primitive on
        // mismatched tensors, which could read out of bounds and clobbered
        // the error with bogus output. Returning here also guarantees the
        // scalar fast path below sees two scalars whenever src1 is scalar.
        return;
      }

      // Scalar fast path: MKL-DNN sum needs at least 1-D memory, so add the
      // two 0-dim values directly. Shape equality (enforced above) ensures
      // src2 is also a scalar here.
      if (!input1_in_mkl_format && src1_dims_size == 0) {
        Tensor* dst_tensor = nullptr;
        MklDnnShape mkl_shape_dst;
        mkl_shape_dst.SetMklTensor(false);
        AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
                                  src1_tensor.shape(), mkl_shape_dst);
        // NOTE(review): the accumulation is done in float regardless of T;
        // only float is registered below, so this is currently lossless.
        float user_i1 = (src1_tensor.scalar<T>()());
        float user_i2 = (src2_tensor.scalar<T>()());
        dst_tensor->scalar<T>()() = std::plus<float>{}(user_i1, user_i2);
        return;
      }

      // If there is nothing to compute, return.
      if (!input1_in_mkl_format && !input2_in_mkl_format) {
        if (src1_tensor.shape().num_elements() == 0) {
          Tensor* dst_tensor = nullptr;
          MklDnnShape mkl_shape_dst;
          mkl_shape_dst.SetMklTensor(false);
          AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
                                    src1_tensor.shape(), mkl_shape_dst);
          return;
        }
      }

      // Unit coefficients: plain element-wise addition.
      const std::vector<float> coeff(2, 1.0f);
      MklDnnData<T> src1(&cpu_engine);
      MklDnnData<T> src2(&cpu_engine);
      MklDnnData<T> dst(&cpu_engine);

      int tmp_size = input1_in_mkl_format ? src2_dims_size : src1_dims_size;
      memory::dims dims(tmp_size);
      memory::dims strides(tmp_size);
      memory::desc md1({}, memory::data_undef, memory::format_undef);
      memory::desc md2({}, memory::data_undef, memory::format_undef);

      // For creating Sum primitive, we need to ensure that all inputs are in
      // same format. What that means is if we have a mixed input case - where
      // one input is in Tensorflow format and one input is in MKL format -,
      // then we need to ensure that all inputs are in same format for
      // primitive construction. For performance reason, we say that all inputs
      // are in MKL format in such case, and insert reorder for input that is
      // in Tensorflow format into MKL format. On the other hand, if both the
      // inputs are in MKL format or both are in Tensorflow format, then we
      // dont need reorder.
      if (!input1_in_mkl_format && !input2_in_mkl_format) {
        // If both the inputs are in Tensorflow format, we create blocked
        // memory descriptor.
        dims = TFShapeToMklDnnDims(src1_tensor.shape());
        strides = CalculateTFStrides(dims);
        md1 = MklDnnData<T>::CreateBlockedMemDesc(dims, strides);
        md2 = md1;
      } else if (input1_in_mkl_format && !input2_in_mkl_format) {
        // If one input is in MKL format and other is in Tensorflow, then
        // create respective descriptors describing the actual case. For input
        // in Mkl format, we just get Mkl layout from MklDnnShape. For input
        // in Tensorflow format, we create memory descriptor using data
        // format.
        md1 = src1_mkl_shape.GetMklLayout();

        memory::format src1_mkl_data_format = src1_mkl_shape.GetTfDataFormat();
        auto src1_tf_data_format =
            MklDnnDataFormatToTFDataFormat(src1_mkl_data_format);
        memory::dims src2_dims;
        // 4-D tensors are NCHW-style; anything else is treated as 5-D
        // NCDHW-style (the only other rank this kernel is fed in practice —
        // TODO confirm against callers).
        if (src2_tensor.dims() == 4) {
          src2_dims = TFShapeToMklDnnDimsInNCHW(src2_tensor.shape(),
                                                src1_tf_data_format);
        } else {
          src2_dims = TFShapeToMklDnnDimsInNCDHW(src2_tensor.shape(),
                                                 src1_tf_data_format);
        }
        md2 = memory::desc(src2_dims, MklDnnType<T>(), src1_mkl_data_format);
      } else if (input2_in_mkl_format && !input1_in_mkl_format) {
        // Same comment as above.
        memory::format src2_mkl_data_format = src2_mkl_shape.GetTfDataFormat();
        auto src2_tf_data_format =
            MklDnnDataFormatToTFDataFormat(src2_mkl_data_format);
        memory::dims src1_dims;
        if (src1_tensor.dims() == 4) {
          src1_dims = TFShapeToMklDnnDimsInNCHW(src1_tensor.shape(),
                                                src2_tf_data_format);
        } else {
          src1_dims = TFShapeToMklDnnDimsInNCDHW(src1_tensor.shape(),
                                                 src2_tf_data_format);
        }
        md1 = memory::desc(src1_dims, MklDnnType<T>(), src2_mkl_data_format);

        md2 = src2_mkl_shape.GetMklLayout();
      } else {
        // If both the inputs are in MKL format, we use Mkl layout of the
        // input tensors.
        md1 = src1_mkl_shape.GetMklLayout();
        md2 = src2_mkl_shape.GetMklLayout();
      }
      src1.SetUsrMem(md1, &src1_tensor);
      src2.SetUsrMem(md2, &src2_tensor);

      // As per comment above, we tell MKLDNN that both the inputs are in same
      // format. So we set common memory descriptor in MKL format, if any of
      // the inputs are in MKL format. Let's get memory descriptor that we
      // will use for both the inputs.
      // We set output memory descriptor in MKL format, if any of the
      // inputs are in MKL format.
      memory::desc common_md({}, memory::data_undef, memory::format_undef);
      if (input1_in_mkl_format || input2_in_mkl_format) {
        common_md = input1_in_mkl_format ? md1 : md2;
        dst.SetUsrMem(common_md);
      } else {
        // Since both the inputs are in Tensorflow format, and have
        // same shape, we can get memory descriptor from any input.
        common_md = md1;
        dst.SetUsrMem(common_md);
      }

      std::vector<memory::primitive_desc> srcs_pd;
      // Memory descriptor for 1st input
      srcs_pd.push_back(memory::primitive_desc(common_md, cpu_engine));
      // Memory descriptor for 2nd input
      srcs_pd.push_back(memory::primitive_desc(common_md, cpu_engine));
      auto sum_pd = sum::primitive_desc(dst.GetUsrMemDesc(), coeff, srcs_pd);

      // Now we setup resources for primitive execution.
      // First, we need to check if any of the inputs need to be reordered as
      // per the logic described above. Since output will be in MKL format if
      // atleast one input is in MKL format, we choose output descriptor for
      // reorder.
      std::vector<primitive::at> inputs;
      // Check if actual input format of the tensor is different than
      // common_pd we told MKLDNN. In that case, we will need reorder.
      src1.CheckReorderToOpMem(srcs_pd[0]);
      src2.CheckReorderToOpMem(srcs_pd[1]);
      inputs.push_back(src1.GetOpMem());
      inputs.push_back(src2.GetOpMem());

      // Allocate output tensor now.
      Tensor* dst_tensor = nullptr;
      MklDnnShape output_mkl_shape;
      TensorShape output_tf_shape;

      if (input2_in_mkl_format || input1_in_mkl_format) {
        // Output inherits the MKL layout; its TF shape is the flat byte size
        // of the MKL buffer expressed in elements of T.
        output_mkl_shape.SetMklTensor(true);
        auto output_pd = dst.GetUsrMemPrimDesc();
        output_mkl_shape.SetMklLayout(&output_pd);
        output_mkl_shape.SetElemType(MklDnnType<T>());
        if (input1_in_mkl_format) {
          output_mkl_shape.SetTfLayout(src1_dims_size,
                                       src1_mkl_shape.GetSizesAsMklDnnDims(),
                                       src1_mkl_shape.GetTfDataFormat());
        } else {
          output_mkl_shape.SetTfLayout(src2_dims_size,
                                       src2_mkl_shape.GetSizesAsMklDnnDims(),
                                       src2_mkl_shape.GetTfDataFormat());
        }
        output_tf_shape.AddDim((output_pd.get_size() / sizeof(T)));
      } else {
        output_mkl_shape.SetMklTensor(false);
        output_tf_shape = src1_tensor.shape();
      }
      AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor, output_tf_shape,
                                output_mkl_shape);
      dst.SetUsrMemDataHandle(dst_tensor);

      // Create Sum op, and submit net for execution.
      std::vector<primitive> net;
      net.push_back(sum(sum_pd, inputs, dst.GetOpMem()));
      stream(stream::kind::eager).submit(net).wait();
    } catch (mkldnn::error& e) {
      string error_msg = "Status: " + std::to_string(e.status) +
                         ", message: " + string(e.message) + ", in file " +
                         string(__FILE__) + ":" + std::to_string(__LINE__);
      OP_REQUIRES_OK(
          ctx, errors::Aborted("Operation received an exception:", error_msg));
    }
  }
};

#define REGISTER_MKL_CPU(T)                                    \
  REGISTER_KERNEL_BUILDER(Name("_MklAddN")                     \
                              .Device(DEVICE_CPU)              \
                              .TypeConstraint<T>("T")          \
                              .Label(mkl_op_registry::kMklOpLabel), \
                          MklAddNOp<CPUDevice, T>);

TF_CALL_float(REGISTER_MKL_CPU);
#undef REGISTER_MKL_CPU
}  // namespace tensorflow
#endif  // INTEL_MKL