/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/eigen3/Eigen/Core"
#include "third_party/eigen3/Eigen/LU"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/lib/math/math_util.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/work_sharder.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;

template <typename Scalar, typename Tidx>
class LuOp : public OpKernel {
 public:
  explicit LuOp(OpKernelConstruction* context) : OpKernel(context) {}

 protected:
  using TensorShapes = gtl::InlinedVector<TensorShape, 4>;
  using TensorOutputs = gtl::InlinedVector<Tensor*, 4>;

  using Matrix =
      Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
  using ConstMatrixMap = Eigen::Map<const Matrix>;
  using MatrixMap = Eigen::Map<Matrix>;

  using RealScalar = typename Eigen::NumTraits<Scalar>::Real;

  using Indices =
      Eigen::Matrix<Tidx, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
  using IndicesMap = Eigen::Map<Indices>;
  using ConstIndicesMap = Eigen::Map<const Indices>;

 public:
  // Returns the cost per matrix operation. This is used to determine the
  // number of threads to use for parallelizing factorization in batch mode.
  // Cost per unit is assumed to be roughly 1ns, based on comments
  // in core/util/work_sharder.cc.
  // LU decomposition for a square matrix takes roughly (2/3) * (num_rows)^3.
  // TODO(anudhyan): Refine this estimate after taking constant factors into
  // account.
  int64 GetCostPerUnit(const TensorShape& input_matrix_shape) const {
    double num_rows = static_cast<double>(input_matrix_shape.dim_size(0));
    double cost = (2 / 3.0) * MathUtil::IPow(num_rows, 3);
    return cost >= static_cast<double>(kint64max) ? kint64max
                                                  : static_cast<int64>(cost);
  }

  void Compute(OpKernelContext* context) override {
    OP_REQUIRES(context, context->num_inputs() == 1,
                errors::InvalidArgument("Expecting exactly one input, got ",
                                        context->num_inputs()));

    const Tensor& input = context->input(0);
    int input_rank = input.dims();
    OP_REQUIRES(context, input_rank >= 2,
                errors::InvalidArgument(
                    "Input tensor must have rank >= 2, got ", input_rank));

    // If the tensor rank is greater than 2, we consider the inner-most
    // dimensions as matrices, and loop over all the other outer ("batch")
    // dimensions to compute the results.
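    // For example, an input of shape [2, 3, 4, 4] is treated as a batch of
    // 2 * 3 = 6 matrices, each of shape [4, 4]; the permutation output
    // computed below then has shape [2, 3, 4].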
    TensorShape input_matrix_shape;
    TensorShape batch_shape;
    for (int dim = 0; dim < input_rank - 2; ++dim) {
      batch_shape.AddDim(input.dim_size(dim));
    }
    const int64 num_rows = input.dim_size(input_rank - 2);
    const int64 num_cols = input.dim_size(input_rank - 1);

    input_matrix_shape.AppendShape({num_rows, num_cols});
    OP_REQUIRES(context, TensorShapeUtils::IsSquareMatrix(input_matrix_shape),
                errors::InvalidArgument("Input matrix must be square."));

    // packed_triangular_factors is a matrix with the same shape as the input;
    // permutation is a vector.
    TensorShape permutation_shape = batch_shape;
    permutation_shape.AddDim(num_rows);

    TensorShapes output_matrix_shapes({input.shape(), permutation_shape});

    TensorOutputs outputs;
    Tensor* output_packed_triangular_factors = nullptr;
    OP_REQUIRES_OK(
        context,
        context->forward_input_or_allocate_output(
            {0}, 0, input.shape(), &output_packed_triangular_factors));
    outputs.emplace_back(output_packed_triangular_factors);

    Tensor* output_permutation = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, permutation_shape,
                                                     &output_permutation));
    outputs.emplace_back(output_permutation);

    if (num_rows == 0) {
      return;
    }

    // Process the individual matrix problems in parallel using a threadpool.
    auto shard = [this, &input, &num_rows, &num_cols, &outputs,
                  &output_matrix_shapes, context](int64 begin, int64 end) {
      for (int64 i = begin; i < end; ++i) {
        ComputeTensorSlice(context, i, input, num_rows, num_cols, outputs,
                           output_matrix_shapes);
      }
    };
    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
    Shard(worker_threads.num_threads, worker_threads.workers,
          batch_shape.num_elements(), GetCostPerUnit(input_matrix_shape),
          shard);
  }

  void ComputeTensorSlice(OpKernelContext* context, int64 matrix_index,
                          const Tensor& input, int64 num_rows, int64 num_cols,
                          const TensorOutputs& outputs,
                          const TensorShapes& output_matrix_shapes) {
    // TODO(kalakris): Handle alignment if possible. Eigen::Map is
    // unaligned by default.
    ConstMatrixMap input_matrix(
        input.flat<Scalar>().data() + matrix_index * num_rows * num_cols,
        num_rows, num_cols);

    // packed_triangular_factors has shape [num_rows, num_cols]
    MatrixMap packed_triangular_factors(
        outputs[0]->flat<Scalar>().data() + matrix_index * num_rows * num_cols,
        num_rows, num_rows);

    // permutation has shape [num_rows, 1]
    IndicesMap permutation_indices(
        outputs[1]->flat<Tidx>().data() + matrix_index * num_rows, num_rows, 1);

    Eigen::PartialPivLU<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>
        lu_decomposition(input_matrix);

    // Output the packed triangular factors in a dense form.
    // The lower triangular factor L corresponds to the strictly lower
    // triangular part of packed_triangular_factors with an implicit unit
    // diagonal. The upper triangular factor U is the upper triangular part of
    // packed_triangular_factors. The triangular factors satisfy the equation
    //     P * input_matrix = L * U
    // where P is the permutation matrix corresponding to the indices in
    // permutation_indices.
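    // Implementation note: a permutation matrix is orthogonal, so its
    // transpose equals its inverse; transposing Eigen's permutationP()
    // below yields an index vector consistent with the P * input_matrix
    // = L * U convention documented above.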
    packed_triangular_factors = lu_decomposition.matrixLU();
    // Output the permutation matrix used for pivoting.
    Eigen::PermutationMatrix<-1, -1, Tidx> permutation =
        lu_decomposition.permutationP().transpose();
    permutation_indices = permutation.indices();

    // PartialPivLU cannot give strong guarantees on invertibility,
    // but we can at least guard against exact zero pivots. This can occur as
    // a result of basic user mistakes, such as providing integer-valued
    // matrices that are exactly singular, or due to underflow if this
    // code is run with denormals being flushed to zero.
    const RealScalar min_abs_pivot =
        packed_triangular_factors.diagonal().cwiseAbs().minCoeff();
    OP_REQUIRES(context, min_abs_pivot > RealScalar(0),
                errors::InvalidArgument("Input is not invertible."));
  }
};

#define REGISTER_LU(type, idx_type)                                         \
  REGISTER_KERNEL_BUILDER(Name("Lu")                                        \
                              .Device(DEVICE_CPU)                           \
                              .TypeConstraint<type>("T")                    \
                              .TypeConstraint<idx_type>("output_idx_type"), \
                          LuOp<type, idx_type>);

REGISTER_LU(float, int32);
REGISTER_LU(double, int32);
REGISTER_LU(complex64, int32);
REGISTER_LU(complex128, int32);

REGISTER_LU(float, int64);
REGISTER_LU(double, int64);
REGISTER_LU(complex64, int64);
REGISTER_LU(complex128, int64);

}  // namespace tensorflow
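// Usage sketch (illustrative, not part of this file): this kernel implements
// the "Lu" op, exposed in the Python API as tf.linalg.lu. A single row swap
// gives trivial factors, making a convention-independent example:
//   lu, p = tf.linalg.lu([[0., 1.], [1., 0.]])
//   # lu == [[1., 0.], [0., 1.]]  (packed L and U), p == [1, 0]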