// libtextclassifier/smartselect/cached-features.h

/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_SMARTSELECT_CACHED_FEATURES_H_
#define LIBTEXTCLASSIFIER_SMARTSELECT_CACHED_FEATURES_H_

#include <functional>
#include <memory>
#include <vector>

#include "base.h"
#include "common/vector-span.h"
#include "smartselect/types.h"

namespace libtextclassifier {

// Holds state for extracting features across multiple calls and reusing them.
// Assumes that features for each Token are independent.
//
// Features for all tokens are extracted once, at construction time, and are
// afterwards served through Get().
class CachedFeatures {
 public:
  // Extracts the features for the given sequence of tokens.
  //  - tokens: The tokens the features belong to. Only the span is stored.
  //            NOTE(review): presumably the underlying tokens must outlive
  //            this object - confirm against VectorSpan semantics.
  //  - context_size: Specifies how many tokens to the left, and how many
  //                   tokens to the right spans the context.
  //  - sparse_features, dense_features: Extracted features for each token.
  //  - feature_vector_fn: Writes features for given Token to the specified
  //                       storage.
  //                       NOTE: The function can assume that the underlying
  //                       storage is initialized to all zeros.
  //                       NOTE(review): the bool result presumably signals
  //                       success; Extract returns void, so the constructor
  //                       has no way to report a failure to the caller.
  //  - feature_vector_size: Size of a feature vector for one Token.
  CachedFeatures(VectorSpan<Token> tokens, int context_size,
                 const std::vector<std::vector<int>>& sparse_features,
                 const std::vector<std::vector<float>>& dense_features,
                 const std::function<bool(const std::vector<int>&,
                                          const std::vector<float>&, float*)>&
                     feature_vector_fn,
                 int feature_vector_size)
      : tokens_(tokens),
        context_size_(context_size),
        feature_vector_size_(feature_vector_size) {
    // All data members are initialized at this point, so Extract may use them.
    Extract(sparse_features, dense_features, feature_vector_fn);
  }

  // Gets a VectorSpan with the features for given click position.
  //  - click_pos: Position of the clicked token.
  //  - features: Output; receives the span with the features.
  //  - output_tokens: Output; receives a span of tokens.
  // NOTE(review): the bool result presumably reports whether the outputs were
  // filled in - confirm against the implementation.
  bool Get(int click_pos, VectorSpan<float>* features,
           VectorSpan<Token>* output_tokens);

  // Turns on a compatibility mode, which re-maps the extracted features to the
  // v0 feature format (where the dense features were at the end).
  // WARNING: Internally v0_feature_storage_ is used as a backing buffer for
  // VectorSpan<float>, so in this mode a returned feature span (presumably the
  // output of Get) is valid only until the next call or destruction of the
  // current CachedFeatures object.
  // TODO(zilka): Remove when we'll have retrained models.
  void SetV0FeatureMode(int chargram_embedding_size) {
    using SizeType = std::vector<float>::size_type;

    remap_v0_feature_vector_ = true;
    remap_v0_chargram_embedding_size_ = chargram_embedding_size;

    // One feature vector for each of the context_size_ tokens on either side,
    // plus one more. The product is computed in the vector's size type rather
    // than in int so that it cannot overflow a signed integer.
    const SizeType window_tokens = static_cast<SizeType>(context_size_) * 2 + 1;
    v0_feature_storage_.resize(static_cast<SizeType>(feature_vector_size_) *
                               window_tokens);
  }

 protected:
  // Extracts features for all tokens and stores them for later retrieval.
  // The parameters have the same meaning as in the constructor.
  void Extract(const std::vector<std::vector<int>>& sparse_features,
               const std::vector<std::vector<float>>& dense_features,
               const std::function<bool(const std::vector<int>&,
                                        const std::vector<float>&, float*)>&
                   feature_vector_fn);

  // Remaps extracted features to V0 feature format. The mapping is using
  // the v0_feature_storage_ as the backing storage for the mapped features.
  // For each token the features consist of:
  //  - chargram embeddings
  //  - dense features
  // They are concatenated together as [chargram embeddings; dense features]
  // for each token independently.
  // The V0 features require that the chargram embeddings for tokens are
  // concatenated first together, and at the end, the dense features for the
  // tokens are concatenated to it.
  void RemapV0FeatureVector(VectorSpan<float>* features);

 private:
  // Tokens given at construction.
  const VectorSpan<Token> tokens_;
  // Number of context tokens on each side.
  const int context_size_;
  // Size of the feature vector of a single token.
  const int feature_vector_size_;

  // Whether the v0 compatibility mode is on; see SetV0FeatureMode.
  bool remap_v0_feature_vector_ = false;
  // Chargram embedding size used by the v0 remapping; -1 until
  // SetV0FeatureMode is called.
  int remap_v0_chargram_embedding_size_ = -1;

  // Storage for the extracted features.
  std::vector<float> features_;
  // Backing buffer for features remapped to the v0 format.
  std::vector<float> v0_feature_storage_;
};

}  // namespace libtextclassifier

#endif  // LIBTEXTCLASSIFIER_SMARTSELECT_CACHED_FEATURES_H_