/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/math_ops.cc.

#ifdef INTEL_MKL
#define EIGEN_USE_THREADS

#include <numeric>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/logging.h"

#include "mkldnn.hpp"
#include "tensorflow/core/util/mkl_util.h"
using mkldnn::stream;
using mkldnn::sum;

namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;

template <typename Device, typename T>
class MklAddNOp : public OpKernel {
 public:
  ~MklAddNOp() {}
  explicit MklAddNOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
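    // Overall flow: validate that exactly two tensors are being added,
    // handle the scalar and empty-input fast paths, unify the input
    // memory formats (inserting reorders where needed), and finally build
    // and execute an MKLDNN sum primitive into the output tensor.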
    const int num = ctx->num_inputs();
    // Only addition of two input tensors is supported now. num_inputs()
    // counts both the data tensors and their accompanying Mkl metadata
    // tensors, hence the division by 2.
    OP_REQUIRES(ctx, num / 2 == 2,
                errors::InvalidArgument("Only addition of two tensors is "
                                        "supported by MKL. Num inputs: ",
                                        num));

    try {
      auto cpu_engine = engine(engine::cpu, 0);
      size_t src1_idx = 0, src2_idx = 1, output_idx = 0;
      const Tensor& src1_tensor = MklGetInput(ctx, src1_idx);
      const Tensor& src2_tensor = MklGetInput(ctx, src2_idx);

      MklDnnShape src1_mkl_shape, src2_mkl_shape;
      GetMklShape(ctx, src1_idx, &src1_mkl_shape);
      GetMklShape(ctx, src2_idx, &src2_mkl_shape);
      bool input1_in_mkl_format = src1_mkl_shape.IsMklTensor();
      bool input2_in_mkl_format = src2_mkl_shape.IsMklTensor();
      int src1_dims_size = input1_in_mkl_format ? src1_mkl_shape.GetDimension()
                                                : src1_tensor.dims();
      int src2_dims_size = input2_in_mkl_format ? src2_mkl_shape.GetDimension()
                                                : src2_tensor.dims();
      // If the shapes of the two tensors are not the same, raise an op error.
      TensorShape src1_shape, src2_shape;
      src1_shape = input1_in_mkl_format ? src1_mkl_shape.GetTfShape()
                                        : src1_tensor.shape();
      src2_shape = input2_in_mkl_format ? src2_mkl_shape.GetTfShape()
                                        : src2_tensor.shape();

      if (!src1_shape.IsSameSize(src2_shape)) {
        ctx->SetStatus(errors::InvalidArgument(
            "Inputs to operation ", this->name(), " of type ",
            this->type_string(),
            " must have the same size and shape.  Input 0: ",
            src1_shape.DebugString(),
            " != input 1: ", src2_shape.DebugString()));
        return;
      }

      // Fast path: input 1 is a scalar in Tensorflow format (and the shape
      // check above guarantees input 2 matches), so add the scalars directly.
      if (!input1_in_mkl_format && src1_dims_size == 0) {
        Tensor* dst_tensor = nullptr;
        MklDnnShape mkl_shape_dst;
        mkl_shape_dst.SetMklTensor(false);
        AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
                                  src1_tensor.shape(), mkl_shape_dst);
        dst_tensor->scalar<T>()() =
            src1_tensor.scalar<T>()() + src2_tensor.scalar<T>()();
        return;
      }

      // If there is nothing to compute, return.
      if (!input1_in_mkl_format && !input2_in_mkl_format) {
        if (src1_tensor.shape().num_elements() == 0) {
          Tensor* dst_tensor = nullptr;
          MklDnnShape mkl_shape_dst;
          mkl_shape_dst.SetMklTensor(false);
          AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
                                    src1_tensor.shape(), mkl_shape_dst);
          return;
        }
      }

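      // Neither fast path applied; fall through to the general MKLDNN
      // sum-primitive path.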
      // Each input contributes with coefficient 1.0, i.e. a plain
      // elementwise sum.
      const std::vector<float> coeff(2, 1.0f);
      MklDnnData<T> src1(&cpu_engine);
      MklDnnData<T> src2(&cpu_engine);
      MklDnnData<T> dst(&cpu_engine);

      int tmp_size = input1_in_mkl_format ? src2_dims_size : src1_dims_size;
      memory::dims dims(tmp_size);
      memory::dims strides(tmp_size);
      // Start with undefined placeholder descriptors; they are filled in
      // below according to the input-format combination.
      memory::desc md1({}, memory::data_undef, memory::format_undef);
      memory::desc md2({}, memory::data_undef, memory::format_undef);

      // To create the Sum primitive, we need all inputs to be in the same
      // format. In the mixed case - one input in Tensorflow format and the
      // other in MKL format - we declare both inputs to be in MKL format,
      // which is the faster choice, and insert a reorder for the input that
      // is in Tensorflow format. If both inputs are in MKL format, or both
      // are in Tensorflow format, no reorder is needed.
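      // The four input-format combinations handled below:
      //   TF  + TF  -> common blocked (row-major) descriptor, no reorder
      //   MKL + TF  -> adopt input 1's MKL layout; input 2 is reordered
      //   TF  + MKL -> adopt input 2's MKL layout; input 1 is reordered
      //   MKL + MKL -> input 1's existing MKL layout is used as the common one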
      if (!input1_in_mkl_format && !input2_in_mkl_format) {
        // If both inputs are in Tensorflow format, we create a blocked
        // memory descriptor.
        dims = TFShapeToMklDnnDims(src1_tensor.shape());
        strides = CalculateTFStrides(dims);
        md1 = MklDnnData<T>::CreateBlockedMemDesc(dims, strides);
        md2 = md1;
      } else if (input1_in_mkl_format && !input2_in_mkl_format) {
        // If one input is in MKL format and the other is in Tensorflow
        // format, create descriptors that describe this case. For the input
        // in Mkl format, we simply get the Mkl layout from MklDnnShape. For
        // the input in Tensorflow format, we create a memory descriptor
        // using the other input's data format.
        md1 = src1_mkl_shape.GetMklLayout();

        memory::format src1_mkl_data_format = src1_mkl_shape.GetTfDataFormat();
        auto src1_tf_data_format =
            MklDnnDataFormatToTFDataFormat(src1_mkl_data_format);
        memory::dims src2_dims;
        if (src2_tensor.dims() == 4) {
          src2_dims = TFShapeToMklDnnDimsInNCHW(src2_tensor.shape(),
                                                src1_tf_data_format);
        } else {
          src2_dims = TFShapeToMklDnnDimsInNCDHW(src2_tensor.shape(),
                                                 src1_tf_data_format);
        }
        md2 = memory::desc(src2_dims, MklDnnType<T>(), src1_mkl_data_format);
      } else if (input2_in_mkl_format && !input1_in_mkl_format) {
        // Same as above, with the roles of the two inputs swapped.
        memory::format src2_mkl_data_format = src2_mkl_shape.GetTfDataFormat();
        auto src2_tf_data_format =
            MklDnnDataFormatToTFDataFormat(src2_mkl_data_format);
        memory::dims src1_dims;
        if (src1_tensor.dims() == 4) {
          src1_dims = TFShapeToMklDnnDimsInNCHW(src1_tensor.shape(),
                                                src2_tf_data_format);
        } else {
          src1_dims = TFShapeToMklDnnDimsInNCDHW(src1_tensor.shape(),
                                                 src2_tf_data_format);
        }
        md1 = memory::desc(src1_dims, MklDnnType<T>(), src2_mkl_data_format);

        md2 = src2_mkl_shape.GetMklLayout();
      } else {
        // If both inputs are in MKL format, we use the Mkl layouts of the
        // input tensors.
        md1 = src1_mkl_shape.GetMklLayout();
        md2 = src2_mkl_shape.GetMklLayout();
      }
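      // Attach the user tensors' data buffers to the descriptors chosen
      // above.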
      src1.SetUsrMem(md1, &src1_tensor);
      src2.SetUsrMem(md2, &src2_tensor);

      // As described above, we tell MKLDNN that both inputs are in the same
      // format, so we pick one common memory descriptor for both. If either
      // input is in MKL format, the common descriptor (and hence the output
      // descriptor) is in MKL format as well.
      memory::desc common_md({}, memory::data_undef, memory::format_undef);
      if (input1_in_mkl_format || input2_in_mkl_format) {
        common_md = input1_in_mkl_format ? md1 : md2;
        dst.SetUsrMem(common_md);
      } else {
        // Since both inputs are in Tensorflow format and have the same
        // shape, we can take the memory descriptor from either input.
        common_md = md1;
        dst.SetUsrMem(common_md);
      }

      std::vector<memory::primitive_desc> srcs_pd;
      // Memory primitive descriptor for the 1st input.
      srcs_pd.push_back(memory::primitive_desc(common_md, cpu_engine));
      // Memory primitive descriptor for the 2nd input.
      srcs_pd.push_back(memory::primitive_desc(common_md, cpu_engine));
      auto sum_pd = sum::primitive_desc(dst.GetUsrMemDesc(), coeff, srcs_pd);

      // Now we set up resources for primitive execution.
      // First, check whether any of the inputs needs to be reordered as per
      // the logic described above. Since the output will be in MKL format if
      // at least one input is in MKL format, we choose the output descriptor
      // for the reorder.
      std::vector<primitive::at> inputs;
      // If the actual input format of a tensor differs from the common
      // descriptor we told MKLDNN about, a reorder is inserted here.
      src1.CheckReorderToOpMem(srcs_pd[0]);
      src2.CheckReorderToOpMem(srcs_pd[1]);
      inputs.push_back(src1.GetOpMem());
      inputs.push_back(src2.GetOpMem());

      // Allocate the output tensor now.
      Tensor* dst_tensor = nullptr;
      MklDnnShape output_mkl_shape;
      TensorShape output_tf_shape;

      if (input2_in_mkl_format || input1_in_mkl_format) {
        output_mkl_shape.SetMklTensor(true);
        auto output_pd = dst.GetUsrMemPrimDesc();
        output_mkl_shape.SetMklLayout(&output_pd);
        output_mkl_shape.SetElemType(MklDnnType<T>());
        if (input1_in_mkl_format) {
          output_mkl_shape.SetTfLayout(src1_dims_size,
                                       src1_mkl_shape.GetSizesAsMklDnnDims(),
                                       src1_mkl_shape.GetTfDataFormat());
        } else {
          output_mkl_shape.SetTfLayout(src2_dims_size,
                                       src2_mkl_shape.GetSizesAsMklDnnDims(),
                                       src2_mkl_shape.GetTfDataFormat());
        }
        // For an output in MKL format, the TF tensor is just a flat buffer
        // large enough to hold the MKL layout.
        output_tf_shape.AddDim((output_pd.get_size() / sizeof(T)));
      } else {
        output_mkl_shape.SetMklTensor(false);
        output_tf_shape = src1_tensor.shape();
      }
      AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor, output_tf_shape,
                                output_mkl_shape);
      dst.SetUsrMemDataHandle(dst_tensor);

      // Create the Sum op and submit the net for execution.
      std::vector<primitive> net;
      net.push_back(sum(sum_pd, inputs, dst.GetOpMem()));
      stream(stream::kind::eager).submit(net).wait();
    } catch (mkldnn::error& e) {
      string error_msg = "Status: " + std::to_string(e.status) +
                         ", message: " + string(e.message) + ", in file " +
                         string(__FILE__) + ":" + std::to_string(__LINE__);
      OP_REQUIRES_OK(
          ctx, errors::Aborted("Operation received an exception: ", error_msg));
    }
  }
};

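// Register _MklAddN on CPU with the Mkl op label so that the MKL graph
// rewrite pass can route two-input AddN nodes to this kernel. Only float
// is registered below.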
#define REGISTER_MKL_CPU(T)                                         \
  REGISTER_KERNEL_BUILDER(Name("_MklAddN")                          \
                              .Device(DEVICE_CPU)                   \
                              .TypeConstraint<T>("T")               \
                              .Label(mkl_op_registry::kMklOpLabel), \
                          MklAddNOp<CPUDevice, T>);

TF_CALL_float(REGISTER_MKL_CPU);
#undef REGISTER_MKL_CPU
}  // namespace tensorflow
#endif  // INTEL_MKL