1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_SORTED_STRINGS_TABLE_H_
18 #define LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_SORTED_STRINGS_TABLE_H_
19 
20 #include <functional>
21 #include <vector>
22 
23 #include "utils/base/integral_types.h"
24 #include "utils/sentencepiece/matcher.h"
25 #include "utils/strings/stringpiece.h"
26 
27 namespace libtextclassifier3 {
28 
29 // A matcher to find string pieces matching prefixes of an input string.
30 // The list of reference strings are kept in sorted order in a zero separated
31 // string.
32 // binary search is used to find all prefix matches.
33 // num_pieces: Number of sentence pieces.
34 // offsets: Offsets into `pieces` where a string starts.
35 // pieces: String pieces, concatenated in sorted order and zero byte separated.
36 // use_linear_scan_threshold: Minimum size of binary search range before
37 //     switching to a linear sweep for prefix match testing.
38 class SortedStringsTable : public SentencePieceMatcher {
39  public:
40   SortedStringsTable(const int num_pieces, const uint32* offsets,
41                      StringPiece pieces,
42                      const int use_linear_scan_threshold = 10)
num_pieces_(num_pieces)43       : num_pieces_(num_pieces),
44         offsets_(offsets),
45         pieces_(pieces),
46         use_linear_scan_threshold_(use_linear_scan_threshold) {}
47 
48   // Find matches that are prefixes of a string.
49   bool FindAllPrefixMatches(StringPiece input,
50                             std::vector<TrieMatch>* matches) const override;
51   // Find the longest prefix match of a string.
52   bool LongestPrefixMatch(StringPiece input,
53                           TrieMatch* longest_match) const override;
54 
55  private:
56   void GatherPrefixMatches(
57       StringPiece input, const std::function<void(TrieMatch)>& update_fn) const;
58 
59   const int num_pieces_;
60   const uint32* offsets_;
61   const StringPiece pieces_;
62   const int use_linear_scan_threshold_;
63 };
64 
65 }  // namespace libtextclassifier3
66 
67 #endif  // LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_SORTED_STRINGS_TABLE_H_
68