1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // This is an internal header file intended to only be included as the
17 // front-matter in the implementation files of various reduction ops.  It
18 // is a header file because we split the various reduction ops into their
19 // own compilation units to get more parallelism in compilation.
20 
21 #ifndef TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_H_
22 #define TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_H_
23 
24 #define EIGEN_USE_THREADS
25 
26 #include "third_party/eigen3/Eigen/Core"
27 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
28 
29 #include "tensorflow/core/framework/numeric_op.h"
30 #include "tensorflow/core/framework/op_kernel.h"
31 #include "tensorflow/core/framework/register_types.h"
32 #include "tensorflow/core/framework/tensor.h"
33 #include "tensorflow/core/framework/types.h"
34 #include "tensorflow/core/kernels/reduction_ops.h"
35 #include "tensorflow/core/kernels/transpose_functor.h"
36 #include "tensorflow/core/lib/core/status.h"
37 #include "tensorflow/core/lib/gtl/inlined_vector.h"
38 #include "tensorflow/core/platform/logging.h"
39 
40 namespace tensorflow {
41 
42 typedef Eigen::ThreadPoolDevice CPUDevice;
43 typedef Eigen::GpuDevice GPUDevice;
44 
45 template <typename Device>
46 struct Constants {
47   // Derive Index type. int (32-bit) or long (64-bit) depending on the
48   // compile-time configuration. "float" here is not relevant.
49   // TODO(zhifengc): Moves the definition to TTypes.
50   typedef TTypes<float>::Tensor::Index Index;
51   Eigen::array<Index, 1> kZero;
52   Eigen::array<Index, 1> kOne;
53   Eigen::array<Index, 2> kZeroTwo;
54 
ConstantsConstants55   Constants() {
56     kZero[0] = 0;
57     kOne[0] = 1;
58     kZeroTwo[0] = 0;
59     kZeroTwo[1] = 2;
60   }
61 };
62 
63 #if defined(EIGEN_HAS_INDEX_LIST)
64 struct ConstantsBase {
65   const Eigen::IndexList<Eigen::type2index<0>> kZero;
66   const Eigen::IndexList<Eigen::type2index<1>> kOne;
67   const Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<2>> kZeroTwo;
68 };
69 template <>
70 struct Constants<CPUDevice> : ConstantsBase {};
71 #endif  // EIGEN_HAS_INDEX_LIST
72 
73 class ReductionHelper {
74  public:
75   ReductionHelper() : reduce_first_axis_(false) {}
76 
77   Status Simplify(const Tensor& data, const Tensor& axis, const bool keep_dims);
78 
79   // We need to do roughly:
80   //   tmp_out = allocate(out_reshape())
81   //   tmp_out.reshape(out_reshape) = data.reshape(data_reshape).reduce(axes)
82   //   out = tmp_out.reshape(out_shape)
83 
84   // The reduction result must be allocated with this shape.
85   TensorShape out_reshape() const;
86 
87   // The final output shape must be allocated with this shape.
88   TensorShape out_shape() const;
89 
90   // The reduction is on a reshaped tensor of this rank.
91   int ndims() const { return data_reshape_.size(); }
92 
93   // True if need to reduce the 0-th dimension.
94   bool reduce_first_axis() const { return reduce_first_axis_; }
95 
96   // The output is reshaped.
97   template <typename T, int N>
98   typename TTypes<T, N>::Tensor out(Tensor* out) {
99     return out->shaped<T, N>(out_reshape_);
100   }
101 
102   // The input is reshaped.
103   template <typename T, int N>
104   typename TTypes<T, N>::ConstTensor in(const Tensor& data) {
105     return data.shaped<T, N>(data_reshape_);
106   }
107 
108   // Shape of shuffled input
109   TensorShape data_reshape() const {
110     TensorShape shape;
111     for (auto s : data_reshape_) shape.AddDim(s);
112     return shape;
113   }
114 
115   // Shape with all reduction dimensions at the end
116   TensorShape shuffled_shape();
117 
118   // Permutation of reduced dims needed to put reduction dimensions at the end
119   gtl::InlinedVector<int32, 8> permutation();
120 
121  private:
122   bool reduce_first_axis_;  // True if need to reduce the 0-th dimension.
123   gtl::InlinedVector<int64, 4> data_reshape_;  // Reshape data before reduction.
124   gtl::InlinedVector<int64, 4> out_shape_;     // The final output shape.
125   gtl::InlinedVector<int64, 4> out_reshape_;   // Reshape output for reduction.
126 };
127 
128 // For operations where the output is a reduction function along some
129 // dimensions of the input.
130 template <typename Device, class T, typename Tperm, typename Reducer>
131 class ReductionOp : public OpKernel {
132  public:
133   explicit ReductionOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
134     const DataType dt = DataTypeToEnum<T>::v();
135     const DataType pt = DataTypeToEnum<Tperm>::v();
136     OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, pt}, {dt}));
137 
138     OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
139   }
140 
141   void Compute(OpKernelContext* ctx) override {
142     const Tensor& data = ctx->input(0);
143     const Tensor& axes = ctx->input(1);
144     VLOG(1) << "data shape: " << data.shape().DebugString();
145     VLOG(1) << "axes      : " << axes.SummarizeValue(10);
146 
147     ReductionHelper helper;
148     OP_REQUIRES_OK(ctx, helper.Simplify(data, axes, keep_dims_));
149     CHECK_GE(helper.ndims(), 0);
150 
151     bool is_scalar_identity = functor::ReducerTraits<Reducer>::IsScalarIdentity;
152     bool is_trivial = helper.ndims() == 0 ||
153                       (helper.ndims() == 1 && !helper.reduce_first_axis());
154     if (is_scalar_identity && is_trivial) {
155       Tensor out;
156       // Special case. Reduces nothing and does not alter the input values.
157       if (!out.CopyFrom(data, helper.out_shape())) {
158         ctx->SetStatus(errors::Internal("Error during reduction copy."));
159       }
160       ctx->set_output(0, out);
161       return;
162     }
163 
164     // We must allocate temp tensors using the same alloc attr as
165     // output(0) because it is returned as output(0) in the end.
166     const AllocatorAttributes alloc_attr = ctx->output_alloc_attr(0);
167 
168     Tensor tmp_out;
169     typedef functor::ReduceFunctor<Device, Reducer> Functor;
170     Constants<Device> constants;
171     const Device& d = ctx->eigen_device<Device>();
172     Reducer reducer;
173 
174     if (data.NumElements() > 0 && is_trivial && !is_scalar_identity) {
175       OP_REQUIRES_OK(ctx, ctx->allocate_temp(ctx->expected_output_dtype(0),
176                                              TensorShape({data.NumElements()}),
177                                              &tmp_out, alloc_attr));
178       Functor::Reduce(ctx, tmp_out.flat<T>(),
179                       data.shaped<T, 2>({1, data.NumElements()}),
180                       constants.kZero, reducer);
181     } else {
182       // A temporary tensor whose size matches the size of the reduced
183       // output.
184       OP_REQUIRES_OK(
185           ctx, ctx->allocate_temp(ctx->expected_output_dtype(0),
186                                   helper.out_reshape(), &tmp_out, alloc_attr));
187 
188       if (tmp_out.NumElements() == 0) {
189         // Nothing to do, fall through to final reshaping.
190       } else if (data.NumElements() == 0) {
191         // Degenerate reduction where the input is empty but the output is
192         // nonempty (thus tmp_out.NumElements() > 0), and we must fill the
193         // output with identity elements.  Example: tf.reduce_sum(tf.zeros((0,
194         // 3)), [0]). Eigen sometimes crashes in this case, so we do it
195         // manually.
196         Functor::FillIdentity(d, tmp_out.flat<T>(), reducer);
197       } else if ((helper.ndims() == 1) && helper.reduce_first_axis()) {
198         // Reduce to a scalar.
199         Functor::Reduce(ctx, helper.out<T, 0>(&tmp_out), helper.in<T, 1>(data),
200                         constants.kZero, reducer);
201       } else if ((helper.ndims() == 2) && helper.reduce_first_axis()) {
202         // Can be viewed as a reduction of a matrix along 1st dimension.
203         Functor::Reduce(ctx, helper.out<T, 1>(&tmp_out), helper.in<T, 2>(data),
204                         constants.kZero, reducer);
205       } else if ((helper.ndims() == 2) && !helper.reduce_first_axis()) {
206         // Can be viewed as a reduction of a matrix along 2nd dimension.
207         Functor::Reduce(ctx, helper.out<T, 1>(&tmp_out), helper.in<T, 2>(data),
208                         constants.kOne, reducer);
209       } else if ((helper.ndims() == 3) && helper.reduce_first_axis()) {
210         // Can be viewed as a reduction of a 3D tensor along 1st and 3rd
211         // dimensions.
212         Functor::Reduce(ctx, helper.out<T, 1>(&tmp_out), helper.in<T, 3>(data),
213                         constants.kZeroTwo, reducer);
214       } else if ((helper.ndims() == 3) && !helper.reduce_first_axis()) {
215         // Can be viewed as a reduction of a 3D tensor along 2nd dimension.
216         Functor::Reduce(ctx, helper.out<T, 2>(&tmp_out), helper.in<T, 3>(data),
217                         constants.kOne, reducer);
218       } else {
219         // If we don't hit one of the cases above, transpose the data so that
220         // all reduced dimensions are last and reuse the 2-D -> 1-D case.
221         Tensor data_reshaped;
222         OP_REQUIRES(ctx, data_reshaped.CopyFrom(data, helper.data_reshape()),
223                     errors::Internal("Error during reduction copy."));
224         Tensor shuffled;
225         OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
226                                                helper.shuffled_shape(),
227                                                &shuffled, alloc_attr));
228         OP_REQUIRES_OK(ctx, DoTranspose(d, data_reshaped, helper.permutation(),
229                                         &shuffled));
230         const int64 unreduced = tmp_out.NumElements();
231         const int64 reduced = shuffled.NumElements() / unreduced;
232         const Tensor& const_shuffled = shuffled;
233         Functor::Reduce(ctx, tmp_out.flat<T>(),
234                         const_shuffled.shaped<T, 2>({unreduced, reduced}),
235                         constants.kOne, reducer);
236       }
237     }
238 
239     // Set the real output using the contents of the reduction but the
240     // real expected output shape.  The number of elements should
241     // match between the two shapes.
242     Tensor out;
243     OP_REQUIRES(ctx, out.CopyFrom(tmp_out, helper.out_shape()),
244                 errors::Internal("Error during reduction copy."));
245     ctx->set_output(0, out);
246   }
247 
248  private:
249   // True if the number of dimensions should be maintained.
250   bool keep_dims_;
251 };
252 
253 namespace functor {
254 
255 template <typename Device, typename Reducer>
256 struct ReduceFunctorBase {
257   template <typename OUT_T, typename IN_T, typename ReductionAxes>
258   static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in,
259                      const ReductionAxes& reduction_axes,
260                      const Reducer& reducer) {
261     const Device& d = ctx->eigen_device<Device>();
262     ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes, Reducer> reducer_impl;
263     reducer_impl(d, out, in, reduction_axes, reducer);
264   }
265 
266   template <typename OUT_T>
267   static void FillIdentity(const Device& d, OUT_T out, const Reducer& reducer) {
268     FillIdentityEigenImpl(d, out, reducer);
269   }
270 };
271 
272 template <typename Reducer>
273 struct ReduceFunctor<CPUDevice, Reducer>
274     : ReduceFunctorBase<CPUDevice, Reducer> {};
275 
276 }  // namespace functor
277 }  // namespace tensorflow
278 
279 #endif  // TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_H_
280