1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/sentencepiece/encoder.h"
18 
19 namespace libtextclassifier3 {
20 
Encode(StringPiece normalized_text,std::vector<int> * encoded_text) const21 bool Encoder::Encode(StringPiece normalized_text,
22                      std::vector<int>* encoded_text) const {
23   const int len = normalized_text.size();
24   if (len <= 0) {
25     *encoded_text = {start_code_, end_code_};
26     return true;
27   }
28   // We use `previous_pos` to indicate whether a dynamic programming state was
29   // reachable.
30   std::vector<SegmentationEntry> segmentation(
31       len + 1, {/*score=*/0, /*previous_pos=*/-1, /*piece_id=*/-1,
32                 /*num_pieces=*/0});
33   for (int i = 0; i < len; i++) {
34     // State couldn't be reached.
35     if (i > 0 && segmentation[i].previous_pos < 0) {
36       // Advance position.
37       normalized_text.RemovePrefix(1);
38       continue;
39     }
40     // Check whether we can use the unknown token.
41     if (unknown_code_ >= 0) {
42       const int pos = i + 1;
43       const float unknown_penalty = segmentation[i].score + unknown_score_;
44       if (segmentation[pos].previous_pos < 0 ||
45           segmentation[pos].score < unknown_penalty) {
46         // Merge multiple unknown tokens into one.
47         if (segmentation[i].piece_id == unknown_code_) {
48           segmentation[pos] = {/*score=*/unknown_penalty,
49                                /*previous_pos=*/segmentation[i].previous_pos,
50                                /*piece_id=*/unknown_code_,
51                                /*num_pieces=*/segmentation[i].num_pieces};
52         } else {
53           segmentation[pos] = {/*score=*/unknown_penalty,
54                                /*previous_pos=*/i,
55                                /*piece_id=*/unknown_code_,
56                                /*num_pieces=*/segmentation[i].num_pieces + 1};
57         }
58       }
59     }
60     std::vector<TrieMatch> matches;
61     if (!matcher_->FindAllPrefixMatches(normalized_text, &matches)) {
62       TC3_LOG(ERROR)
63           << "Couldn't successfully gather prefix sentence piece matches.";
64       return false;
65     }
66     for (const auto& match : matches) {
67       TC3_CHECK(match.id >= 0 && match.id < num_pieces_);
68       const int pos = i + match.match_length;
69       const float candidate_score = segmentation[i].score + scores_[match.id];
70       if (segmentation[pos].previous_pos < 0 ||
71           segmentation[pos].score < candidate_score) {
72         segmentation[pos] = {/*score=*/candidate_score, /*previous_pos=*/i,
73                              /*piece_id=*/match.id + encoding_offset_,
74                              /*num_pieces=*/segmentation[i].num_pieces + 1};
75       }
76     }
77     // Advance position.
78     normalized_text.RemovePrefix(1);
79   }
80   if (segmentation[len].num_pieces <= 0) {
81     *encoded_text = {start_code_, end_code_};
82     return true;
83   }
84   const int num_pieces = segmentation[len].num_pieces;
85   encoded_text->resize(num_pieces + 2);
86   (*encoded_text)[num_pieces + 1] = end_code_;
87   int pos = len;
88   for (int i = num_pieces; i > 0; i--) {
89     (*encoded_text)[i] = segmentation[pos].piece_id;
90     pos = segmentation[pos].previous_pos;
91   }
92   (*encoded_text)[0] = start_code_;
93   return true;
94 }
95 
96 }  // namespace libtextclassifier3
97