1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // Inference code for the text classification model.
18 
19 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_ANNOTATOR_H_
20 #define LIBTEXTCLASSIFIER_ANNOTATOR_ANNOTATOR_H_
21 
22 #include <memory>
23 #include <set>
24 #include <string>
25 #include <unordered_set>
26 #include <vector>
27 
28 #include "annotator/contact/contact-engine.h"
29 #include "annotator/datetime/datetime-grounder.h"
30 #include "annotator/datetime/parser.h"
31 #include "annotator/duration/duration.h"
32 #include "annotator/experimental/experimental.h"
33 #include "annotator/feature-processor.h"
34 #include "annotator/grammar/grammar-annotator.h"
35 #include "annotator/installed_app/installed-app-engine.h"
36 #include "annotator/knowledge/knowledge-engine.h"
37 #include "annotator/model-executor.h"
38 #include "annotator/model_generated.h"
39 #include "annotator/number/number.h"
40 #include "annotator/person_name/person-name-engine.h"
41 #include "annotator/pod_ner/pod-ner.h"
42 #include "annotator/strip-unpaired-brackets.h"
43 #include "annotator/translate/translate.h"
44 #include "annotator/types.h"
45 #include "annotator/vocab/vocab-annotator.h"
46 #include "annotator/zlib-utils.h"
47 #include "utils/base/status.h"
48 #include "utils/base/statusor.h"
49 #include "utils/calendar/calendar.h"
50 #include "utils/flatbuffers/flatbuffers.h"
51 #include "utils/flatbuffers/mutable.h"
52 #include "utils/i18n/locale.h"
53 #include "utils/memory/mmap.h"
54 #include "utils/utf8/unicodetext.h"
55 #include "utils/utf8/unilib.h"
56 #include "utils/zlib/zlib.h"
57 #include "lang_id/lang-id.h"
58 
59 namespace libtextclassifier3 {
60 
61 // Holds TFLite interpreters for selection and classification models.
62 // NOTE: This class is not thread-safe, thus should NOT be re-used across
63 // threads.
64 class InterpreterManager {
65  public:
66   // The constructor can be called with nullptr for any of the executors, and is
67   // a defined behavior, as long as the corresponding *Interpreter() method is
68   // not called when the executor is null.
InterpreterManager(const ModelExecutor * selection_executor,const ModelExecutor * classification_executor)69   InterpreterManager(const ModelExecutor* selection_executor,
70                      const ModelExecutor* classification_executor)
71       : selection_executor_(selection_executor),
72         classification_executor_(classification_executor) {}
73 
74   // Gets or creates and caches an interpreter for the selection model.
75   tflite::Interpreter* SelectionInterpreter();
76 
77   // Gets or creates and caches an interpreter for the classification model.
78   tflite::Interpreter* ClassificationInterpreter();
79 
80  private:
81   const ModelExecutor* selection_executor_;
82   const ModelExecutor* classification_executor_;
83 
84   std::unique_ptr<tflite::Interpreter> selection_interpreter_;
85   std::unique_ptr<tflite::Interpreter> classification_interpreter_;
86 };
87 
// Predicate over entity type names: answers whether a given entity type is
// enabled for annotation.
//
// Only a reference to the set is kept, so the set passed to the constructor
// has to stay alive for as long as this object is used.
class EnabledEntityTypes {
 public:
  explicit EnabledEntityTypes(
      const std::unordered_set<std::string>& entity_types)
      : entity_types_(entity_types) {}

  // Returns true when 'entity_type' is enabled. An empty set places no
  // restriction, i.e. every entity type is reported as enabled.
  bool operator()(const std::string& entity_type) const {
    if (entity_types_.empty()) {
      return true;
    }
    return entity_types_.count(entity_type) != 0;
  }

 private:
  // Not owned; refers to the caller's set.
  const std::unordered_set<std::string>& entity_types_;
};
104 
105 // A text processing model that provides text classification, annotation,
106 // selection suggestion for various types.
107 // NOTE: This class is not thread-safe.
108 class Annotator {
109  public:
110   static std::unique_ptr<Annotator> FromUnownedBuffer(
111       const char* buffer, int size, const UniLib* unilib = nullptr,
112       const CalendarLib* calendarlib = nullptr);
113   // Copies the underlying model buffer string.
114   static std::unique_ptr<Annotator> FromString(
115       const std::string& buffer, const UniLib* unilib = nullptr,
116       const CalendarLib* calendarlib = nullptr);
117   // Takes ownership of the mmap.
118   static std::unique_ptr<Annotator> FromScopedMmap(
119       std::unique_ptr<ScopedMmap>* mmap, const UniLib* unilib = nullptr,
120       const CalendarLib* calendarlib = nullptr);
121   static std::unique_ptr<Annotator> FromScopedMmap(
122       std::unique_ptr<ScopedMmap>* mmap, std::unique_ptr<UniLib> unilib,
123       std::unique_ptr<CalendarLib> calendarlib);
124   static std::unique_ptr<Annotator> FromFileDescriptor(
125       int fd, int offset, int size, const UniLib* unilib = nullptr,
126       const CalendarLib* calendarlib = nullptr);
127   static std::unique_ptr<Annotator> FromFileDescriptor(
128       int fd, int offset, int size, std::unique_ptr<UniLib> unilib,
129       std::unique_ptr<CalendarLib> calendarlib);
130   static std::unique_ptr<Annotator> FromFileDescriptor(
131       int fd, const UniLib* unilib = nullptr,
132       const CalendarLib* calendarlib = nullptr);
133   static std::unique_ptr<Annotator> FromFileDescriptor(
134       int fd, std::unique_ptr<UniLib> unilib,
135       std::unique_ptr<CalendarLib> calendarlib);
136   static std::unique_ptr<Annotator> FromPath(
137       const std::string& path, const UniLib* unilib = nullptr,
138       const CalendarLib* calendarlib = nullptr);
139   static std::unique_ptr<Annotator> FromPath(
140       const std::string& path, std::unique_ptr<UniLib> unilib,
141       std::unique_ptr<CalendarLib> calendarlib);
142 
143   // Returns true if the model is ready for use.
IsInitialized()144   bool IsInitialized() { return initialized_; }
145 
146   // Initializes the knowledge engine with the given config.
147   bool InitializeKnowledgeEngine(const std::string& serialized_config);
148 
149   // Initializes the contact engine with the given config.
150   bool InitializeContactEngine(const std::string& serialized_config);
151 
152   // Initializes the installed app engine with the given config.
153   bool InitializeInstalledAppEngine(const std::string& serialized_config);
154 
155   // Initializes the person name engine with the given person name model in the
156   // provided buffer. The buffer needs to outlive the annotator.
157   bool InitializePersonNameEngineFromUnownedBuffer(const void* buffer,
158                                                    int size);
159 
160   // Initializes the person name engine with the given person name model from
161   // the provided mmap.
162   bool InitializePersonNameEngineFromScopedMmap(const ScopedMmap& mmap);
163 
164   // Initializes the person name engine with the given person name model in the
165   // provided file path.
166   bool InitializePersonNameEngineFromPath(const std::string& path);
167 
168   // Initializes the person name engine with the given person name model in the
169   // provided file descriptor.
170   bool InitializePersonNameEngineFromFileDescriptor(int fd, int offset,
171                                                     int size);
172 
173   // Initializes the experimental annotators if available.
174   // Returns true if there is an implementation of experimental annotators
175   // linked in.
176   bool InitializeExperimentalAnnotators();
177 
178   // Sets up the lang-id instance that should be used.
179   bool SetLangId(const libtextclassifier3::mobile::lang_id::LangId* lang_id);
180 
181   // Runs inference for given a context and current selection (i.e. index
182   // of the first and one past last selected characters (utf8 codepoint
183   // offsets)). Returns the indices (utf8 codepoint offsets) of the selection
184   // beginning character and one past selection end character.
185   // Returns the original click_indices if an error occurs.
186   // NOTE: The selection indices are passed in and returned in terms of
187   // UTF8 codepoints (not bytes).
188   // Requires that the model is a smart selection model.
189   CodepointSpan SuggestSelection(
190       const std::string& context, CodepointSpan click_indices,
191       const SelectionOptions& options = SelectionOptions()) const;
192 
193   // Classifies the selected text given the context string.
194   // Returns an empty result if an error occurs.
195   std::vector<ClassificationResult> ClassifyText(
196       const std::string& context, const CodepointSpan& selection_indices,
197       const ClassificationOptions& options = ClassificationOptions()) const;
198 
199   // Annotates the given structed input request. Models which handle the full
200   // context request will receive all the metadata they require. While models
201   // that don't use the extra context are called using only a string.
202   // For each fragment the annotations are sorted by their position in
203   // the fragment and exclude spans classified as 'other'.
204   //
205   // The number of vectors of annotated spans will match the number
206   // of input fragments. The order of annotation span vectors will match the
207   // order of input fragments. If annotation is not possible for any of the
208   // annotators, no annotation is returned.
209   StatusOr<Annotations> AnnotateStructuredInput(
210       const std::vector<InputFragment>& string_fragments,
211       const AnnotationOptions& options = AnnotationOptions()) const;
212 
213   // Annotates given input text. The annotations are sorted by their position
214   // in the context string and exclude spans classified as 'other'.
215   std::vector<AnnotatedSpan> Annotate(
216       const std::string& context,
217       const AnnotationOptions& options = AnnotationOptions()) const;
218 
219   // Looks up a knowledge entity by its id. Returns the serialized knowledge
220   // result.
221   StatusOr<std::string> LookUpKnowledgeEntity(const std::string& id) const;
222 
223   // Looks up an entity's property.
224   StatusOr<std::string> LookUpKnowledgeEntityProperty(
225       const std::string& mid_str, const std::string& property) const;
226 
227   const Model* model() const;
228   const reflection::Schema* entity_data_schema() const;
229 
230   // Exposes the feature processor for tests and evaluations.
231   const FeatureProcessor* SelectionFeatureProcessorForTests() const;
232   const FeatureProcessor* ClassificationFeatureProcessorForTests() const;
233 
234   // Exposes the date time parser for tests and evaluations.
235   const DatetimeParser* DatetimeParserForTests() const;
236 
237   static const std::string& kPhoneCollection;
238   static const std::string& kAddressCollection;
239   static const std::string& kDateCollection;
240   static const std::string& kUrlCollection;
241   static const std::string& kEmailCollection;
242 
243  protected:
244   struct ScoredChunk {
245     TokenSpan token_span;
246     float score;
247   };
248 
249   // NOTE: ValidateAndInitialize needs to be called before any other method.
Annotator()250   Annotator() : initialized_(false) {}
251 
252   // Checks that model contains all required fields, and initializes internal
253   // datastructures.
254   // Needs to be called before any other method is.
255   void ValidateAndInitialize(const Model* model, const UniLib* unilib,
256                              const CalendarLib* calendarlib);
257 
258   // Initializes regular expressions for the regex model.
259   bool InitializeRegexModel(ZlibDecompressor* decompressor);
260 
261   // Resolves conflicts in the list of candidates by removing some overlapping
262   // ones. Returns indices of the surviving ones.
263   // NOTE: Assumes that the candidates are sorted according to their position in
264   // the span.
265   bool ResolveConflicts(const std::vector<AnnotatedSpan>& candidates,
266                         const std::string& context,
267                         const std::vector<Token>& cached_tokens,
268                         const std::vector<Locale>& detected_text_language_tags,
269                         const BaseOptions& options,
270                         InterpreterManager* interpreter_manager,
271                         std::vector<int>* result) const;
272 
273   // Resolves one conflict between candidates on indices 'start_index'
274   // (inclusive) and 'end_index' (exclusive). Assigns the winning candidate
275   // indices to 'chosen_indices'. Returns false if a problem arises.
276   bool ResolveConflict(const std::string& context,
277                        const std::vector<Token>& cached_tokens,
278                        const std::vector<AnnotatedSpan>& candidates,
279                        const std::vector<Locale>& detected_text_language_tags,
280                        int start_index, int end_index,
281                        const BaseOptions& options,
282                        InterpreterManager* interpreter_manager,
283                        std::vector<int>* chosen_indices) const;
284 
285   // Gets selection candidates from the ML model.
286   // Provides the tokens produced during tokenization of the context string for
287   // reuse.
288   bool ModelSuggestSelection(
289       const UnicodeText& context_unicode, const CodepointSpan& click_indices,
290       const std::vector<Locale>& detected_text_language_tags,
291       InterpreterManager* interpreter_manager, std::vector<Token>* tokens,
292       std::vector<AnnotatedSpan>* result) const;
293 
294   // Classifies the selected text given the context string with the
295   // classification model.
296   // The following arguments are optional:
297   //   - cached_tokens - can be given as empty
298   //   - embedding_cache - can be given as nullptr
299   //   - tokens - can be given as nullptr
300   // Returns true if no error occurred.
301   bool ModelClassifyText(
302       const std::string& context, const std::vector<Token>& cached_tokens,
303       const std::vector<Locale>& detected_text_language_tags,
304       const CodepointSpan& selection_indices, const BaseOptions& options,
305       InterpreterManager* interpreter_manager,
306       FeatureProcessor::EmbeddingCache* embedding_cache,
307       std::vector<ClassificationResult>* classification_results,
308       std::vector<Token>* tokens) const;
309 
310   // Same as above, but (for optimization) takes the context as UnicodeText and
311   // takes the following extra arguments:
312   //   - span_begin, span_end - iterators in context_unicode corresponding to
313   //     selection_indices
314   //   - line - a UnicodeTextRange within context_unicode corresponding to the
315   //     line containing the selection - optional, can be given as nullptr
316   bool ModelClassifyText(
317       const UnicodeText& context_unicode,
318       const std::vector<Token>& cached_tokens,
319       const std::vector<Locale>& detected_text_language_tags,
320       const UnicodeText::const_iterator& span_begin,
321       const UnicodeText::const_iterator& span_end, const UnicodeTextRange* line,
322       const CodepointSpan& selection_indices, const BaseOptions& options,
323       InterpreterManager* interpreter_manager,
324       FeatureProcessor::EmbeddingCache* embedding_cache,
325       std::vector<ClassificationResult>* classification_results,
326       std::vector<Token>* tokens) const;
327 
328   // Returns a relative token span that represents how many tokens on the left
329   // from the selection and right from the selection are needed for the
330   // classifier input.
331   TokenSpan ClassifyTextUpperBoundNeededTokens() const;
332 
333   // Classifies the selected text with the regular expressions models.
334   // Returns true if no error happened, false otherwise.
335   bool RegexClassifyText(
336       const std::string& context, const CodepointSpan& selection_indices,
337       std::vector<ClassificationResult>* classification_result) const;
338 
339   // Classifies the selected text with the date time model.
340   // Returns true if no error happened, false otherwise.
341   bool DatetimeClassifyText(
342       const std::string& context, const CodepointSpan& selection_indices,
343       const ClassificationOptions& options,
344       std::vector<ClassificationResult>* classification_results) const;
345 
346   // Chunks given input text with the selection model and classifies the spans
347   // with the classification model.
348   // The annotations are sorted by their position in the context string and
349   // exclude spans classified as 'other'.
350   // Provides the tokens produced during tokenization of the context string for
351   // reuse.
352   bool ModelAnnotate(const std::string& context,
353                      const std::vector<Locale>& detected_text_language_tags,
354                      const AnnotationOptions& options,
355                      InterpreterManager* interpreter_manager,
356                      std::vector<Token>* tokens,
357                      std::vector<AnnotatedSpan>* result) const;
358 
359   // Groups the tokens into chunks. A chunk is a token span that should be the
360   // suggested selection when any of its contained tokens is clicked. The chunks
361   // are non-overlapping and are sorted by their position in the context string.
362   // "num_tokens" is the total number of tokens available (as this method does
363   // not need the actual vector of tokens).
364   // "span_of_interest" is a span of all the tokens that could be clicked.
365   // The resulting chunks all have to overlap with it and they cover this span
366   // completely. The first and last chunk might extend beyond it.
367   // The chunks vector is cleared before filling.
368   bool ModelChunk(int num_tokens, const TokenSpan& span_of_interest,
369                   tflite::Interpreter* selection_interpreter,
370                   const CachedFeatures& cached_features,
371                   std::vector<TokenSpan>* chunks) const;
372 
373   // A helper method for ModelChunk(). It generates scored chunk candidates for
374   // a click context model.
375   // NOTE: The returned chunks can (and most likely do) overlap.
376   bool ModelClickContextScoreChunks(
377       int num_tokens, const TokenSpan& span_of_interest,
378       const CachedFeatures& cached_features,
379       tflite::Interpreter* selection_interpreter,
380       std::vector<ScoredChunk>* scored_chunks) const;
381 
382   // A helper method for ModelChunk(). It generates scored chunk candidates for
383   // a bounds-sensitive model.
384   // NOTE: The returned chunks can (and most likely do) overlap.
385   bool ModelBoundsSensitiveScoreChunks(
386       int num_tokens, const TokenSpan& span_of_interest,
387       const TokenSpan& inference_span, const CachedFeatures& cached_features,
388       tflite::Interpreter* selection_interpreter,
389       std::vector<ScoredChunk>* scored_chunks) const;
390 
391   // Produces chunks isolated by a set of regular expressions.
392   bool RegexChunk(const UnicodeText& context_unicode,
393                   const std::vector<int>& rules,
394                   bool is_serialized_entity_data_enabled,
395                   const EnabledEntityTypes& enabled_entity_types,
396                   const AnnotationUsecase& annotation_usecase,
397 
398                   std::vector<AnnotatedSpan>* result) const;
399 
400   // Produces chunks from the datetime parser.
401   bool DatetimeChunk(const UnicodeText& context_unicode,
402                      int64 reference_time_ms_utc,
403                      const std::string& reference_timezone,
404                      const std::string& locales, ModeFlag mode,
405                      AnnotationUsecase annotation_usecase,
406                      bool is_serialized_entity_data_enabled,
407                      std::vector<AnnotatedSpan>* result) const;
408 
409   // Returns whether a classification should be filtered.
410   bool FilteredForAnnotation(const AnnotatedSpan& span) const;
411   bool FilteredForClassification(
412       const ClassificationResult& classification) const;
413   bool FilteredForSelection(const AnnotatedSpan& span) const;
414 
415   // Computes the selection boundaries from a regular expression match.
416   CodepointSpan ComputeSelectionBoundaries(
417       const UniLib::RegexMatcher* match,
418       const RegexModel_::Pattern* config) const;
419 
420   // Returns whether a regex pattern provides entity data from a match.
421   bool HasEntityData(const RegexModel_::Pattern* pattern) const;
422 
423   // Constructs and serializes entity data from regex matches.
424   bool SerializedEntityDataFromRegexMatch(
425       const RegexModel_::Pattern* pattern, UniLib::RegexMatcher* matcher,
426       std::string* serialized_entity_data) const;
427 
428   // For knowledge candidates which have a ContactPointer, fill in the
429   // appropriate contact metadata, if possible.
430   void AddContactMetadataToKnowledgeClassificationResults(
431       std::vector<AnnotatedSpan>* candidates) const;
432 
433   // Gets priority score from the list of classification results.
434   float GetPriorityScore(
435       const std::vector<ClassificationResult>& classification) const;
436 
437   // Verifies a regex match and returns true if verification was successful.
438   bool VerifyRegexMatchCandidate(
439       const std::string& context,
440       const VerificationOptions* verification_options, const std::string& match,
441       const UniLib::RegexMatcher* matcher) const;
442 
443   const Model* model_;
444 
445   std::unique_ptr<const ModelExecutor> selection_executor_;
446   std::unique_ptr<const ModelExecutor> classification_executor_;
447   std::unique_ptr<const EmbeddingExecutor> embedding_executor_;
448 
449   std::unique_ptr<const FeatureProcessor> selection_feature_processor_;
450   std::unique_ptr<const FeatureProcessor> classification_feature_processor_;
451 
452   std::unique_ptr<const grammar::Analyzer> analyzer_;
453   std::unique_ptr<const DatetimeGrounder> datetime_grounder_;
454   std::unique_ptr<const DatetimeParser> datetime_parser_;
455   std::unique_ptr<const GrammarAnnotator> grammar_annotator_;
456 
457   std::string owned_buffer_;
458   std::unique_ptr<UniLib> owned_unilib_;
459   std::unique_ptr<CalendarLib> owned_calendarlib_;
460 
461  private:
462   struct CompiledRegexPattern {
463     const RegexModel_::Pattern* config;
464     std::unique_ptr<UniLib::RegexPattern> pattern;
465   };
466 
467   // Removes annotations the entity type of which is not in the set of enabled
468   // entity types.
469   void RemoveNotEnabledEntityTypes(
470       const EnabledEntityTypes& is_entity_type_enabled,
471       std::vector<AnnotatedSpan>* annotated_spans) const;
472 
473   // Runs only annotators that do not support structured input. Does conflict
474   // resolution, removal of disallowed entities and sorting on both new
475   // generated candidates and passed in entities.
476   // Returns Status::Error if the annotation failed, in which case the vector of
477   // candidates should be ignored.
478   Status AnnotateSingleInput(const std::string& context,
479                              const AnnotationOptions& options,
480                              std::vector<AnnotatedSpan>* candidates) const;
481 
482   // Parses the money amount into whole and decimal part and fills in the
483   // entity data information.
484   bool ParseAndFillInMoneyAmount(std::string* serialized_entity_data,
485                                  const UniLib::RegexMatcher* match,
486                                  const RegexModel_::Pattern* config,
487                                  const UnicodeText& context_unicode) const;
488 
489   // Given the regex capturing groups, extract the one representing the money
490   // quantity and fills in the actual string and the power of 10 the amount
491   // should be multiplied with.
492   void GetMoneyQuantityFromCapturingGroup(const UniLib::RegexMatcher* match,
493                                           const RegexModel_::Pattern* config,
494                                           const UnicodeText& context_unicode,
495                                           std::string* quantity,
496                                           int* exponent) const;
497 
498   // Returns true if any of the ff-model entity types is enabled.
499   bool IsAnyModelEntityTypeEnabled(
500       const EnabledEntityTypes& is_entity_type_enabled) const;
501 
502   // Returns true if any of the regex entity types is enabled.
503   bool IsAnyRegexEntityTypeEnabled(
504       const EnabledEntityTypes& is_entity_type_enabled) const;
505 
506   // Returns true if any of the POD NER entity types is enabled.
507   bool IsAnyPodNerEntityTypeEnabled(
508       const EnabledEntityTypes& is_entity_type_enabled) const;
509 
510   std::unique_ptr<ScopedMmap> mmap_;
511   bool initialized_ = false;
512   bool enabled_for_annotation_ = false;
513   bool enabled_for_classification_ = false;
514   bool enabled_for_selection_ = false;
515   std::unordered_set<std::string> filtered_collections_annotation_;
516   std::unordered_set<std::string> filtered_collections_classification_;
517   std::unordered_set<std::string> filtered_collections_selection_;
518 
519   std::vector<CompiledRegexPattern> regex_patterns_;
520 
521   // Indices into regex_patterns_ for the different modes.
522   std::vector<int> annotation_regex_patterns_, classification_regex_patterns_,
523       selection_regex_patterns_;
524 
525   const UniLib* unilib_;
526   const CalendarLib* calendarlib_;
527 
528   std::unique_ptr<const KnowledgeEngine> knowledge_engine_;
529   std::unique_ptr<const ContactEngine> contact_engine_;
530   std::unique_ptr<const InstalledAppEngine> installed_app_engine_;
531   std::unique_ptr<const NumberAnnotator> number_annotator_;
532   std::unique_ptr<const DurationAnnotator> duration_annotator_;
533   std::unique_ptr<const PersonNameEngine> person_name_engine_;
534   std::unique_ptr<const TranslateAnnotator> translate_annotator_;
535   std::unique_ptr<const PodNerAnnotator> pod_ner_annotator_;
536   std::unique_ptr<const ExperimentalAnnotator> experimental_annotator_;
537   std::unique_ptr<const VocabAnnotator> vocab_annotator_;
538 
539   // Builder for creating extra data.
540   const reflection::Schema* entity_data_schema_;
541   std::unique_ptr<MutableFlatbufferBuilder> entity_data_builder_;
542 
543   // Locales for which the entire model triggers.
544   std::vector<Locale> model_triggering_locales_;
545 
546   // Locales for which the ML model triggers.
547   std::vector<Locale> ml_model_triggering_locales_;
548 
549   // Locales that the dictionary classification support.
550   std::vector<Locale> dictionary_locales_;
551 
552   // Decimal and thousands number separators.
553   std::unordered_set<char32> money_separators_;
554 
555   // Model for language identification.
556   const libtextclassifier3::mobile::lang_id::LangId* lang_id_ = nullptr;
557 
558   // If true, will prioritize the longest annotation during conflict resolution.
559   bool prioritize_longest_annotation_ = false;
560 
561   // If true, the annotator will perform conflict resolution between the
562   // different sub-annotators also in the RAW mode. If false, no conflict
563   // resolution will be performed in RAW mode.
564   bool do_conflict_resolution_in_raw_mode_ = true;
565 };
566 
567 namespace internal {
568 
569 // Helper function, which if the initial 'span' contains only white-spaces,
570 // moves the selection to a single-codepoint selection on the left side
571 // of this block of white-space.
572 CodepointSpan SnapLeftIfWhitespaceSelection(const CodepointSpan& span,
573                                             const UnicodeText& context_unicode,
574                                             const UniLib& unilib);
575 
576 // Copies tokens from 'cached_tokens' that are
577 // 'tokens_around_selection_to_copy' (on the left, and right) tokens distant
578 // from the tokens that correspond to 'selection_indices'.
579 std::vector<Token> CopyCachedTokens(const std::vector<Token>& cached_tokens,
580                                     const CodepointSpan& selection_indices,
581                                     TokenSpan tokens_around_selection_to_copy);
582 }  // namespace internal
583 
584 // Interprets the buffer as a Model flatbuffer and returns it for reading.
585 const Model* ViewModel(const void* buffer, int size);
586 
587 // Opens model from given path and runs a function, passing the loaded Model
588 // flatbuffer as an argument.
589 //
590 // This is mainly useful if we don't want to pay the cost for the model
591 // initialization because we'll be only reading some flatbuffer values from the
592 // file.
593 template <typename ReturnType, typename Func>
VisitAnnotatorModel(const std::string & path,Func function)594 ReturnType VisitAnnotatorModel(const std::string& path, Func function) {
595   ScopedMmap mmap(path);
596   if (!mmap.handle().ok()) {
597     function(/*model=*/nullptr);
598   }
599   const Model* model =
600       ViewModel(mmap.handle().start(), mmap.handle().num_bytes());
601   return function(model);
602 }
603 
604 }  // namespace libtextclassifier3
605 
606 #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_ANNOTATOR_H_
607