1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // An implementation of Unilib that uses Android Java interfaces via JNI. The
18 // performance critical ops have been re-implemented in C++.
19 // Specifically, this class must be compatible with API level 14 (ICS).
20 
21 #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
22 #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
23 
24 #include <jni.h>
25 #include <memory>
26 #include <mutex>  // NOLINT
27 #include <string>
28 
29 #include "utils/base/integral_types.h"
30 #include "utils/java/jni-cache.h"
31 #include "utils/java/scoped_global_ref.h"
32 #include "utils/java/scoped_local_ref.h"
33 #include "utils/java/string_utils.h"
34 #include "utils/utf8/unicodetext.h"
35 
36 namespace libtextclassifier3 {
37 
38 class UniLib {
39  public:
40   UniLib();
41   explicit UniLib(const std::shared_ptr<JniCache>& jni_cache);
42 
43   bool ParseInt32(const UnicodeText& text, int* result) const;
44   bool IsOpeningBracket(char32 codepoint) const;
45   bool IsClosingBracket(char32 codepoint) const;
46   bool IsWhitespace(char32 codepoint) const;
47   bool IsDigit(char32 codepoint) const;
48   bool IsUpper(char32 codepoint) const;
49 
50   char32 ToLower(char32 codepoint) const;
51   char32 GetPairedBracket(char32 codepoint) const;
52 
53   // Forward declaration for friend.
54   class RegexPattern;
55 
56   class RegexMatcher {
57    public:
58     static constexpr int kError = -1;
59     static constexpr int kNoError = 0;
60 
61     // Checks whether the input text matches the pattern exactly.
62     bool Matches(int* status) const;
63 
64     // Approximate Matches() implementation implemented using Find(). It uses
65     // the first Find() result and then checks that it spans the whole input.
66     // NOTE: Unlike Matches() it can result in false negatives.
67     // NOTE: Resets the matcher, so the current Find() state will be lost.
68     bool ApproximatelyMatches(int* status);
69 
70     // Finds occurrences of the pattern in the input text.
71     // Can be called repeatedly to find all occurences. A call will update
72     // internal state, so that 'Start', 'End' and 'Group' can be called to get
73     // information about the match.
74     // NOTE: Any call to ApproximatelyMatches() in between Find() calls will
75     // modify the state.
76     bool Find(int* status);
77 
78     // Gets the start offset of the last match (from  'Find').
79     // Sets status to 'kError' if 'Find'
80     // was not called previously.
81     int Start(int* status) const;
82 
83     // Gets the start offset of the specified group of the last match.
84     // (from  'Find').
85     // Sets status to 'kError' if an invalid group was specified or if 'Find'
86     // was not called previously.
87     int Start(int group_idx, int* status) const;
88 
89     // Gets the end offset of the last match (from  'Find').
90     // Sets status to 'kError' if 'Find'
91     // was not called previously.
92     int End(int* status) const;
93 
94     // Gets the end offset of the specified group of the last match.
95     // (from  'Find').
96     // Sets status to 'kError' if an invalid group was specified or if 'Find'
97     // was not called previously.
98     int End(int group_idx, int* status) const;
99 
100     // Gets the text of the last match (from 'Find').
101     // Sets status to 'kError' if 'Find' was not called previously.
102     UnicodeText Group(int* status) const;
103 
104     // Gets the text of the specified group of the last match (from 'Find').
105     // Sets status to 'kError' if an invalid group was specified or if 'Find'
106     // was not called previously.
107     UnicodeText Group(int group_idx, int* status) const;
108 
109     // Returns the matched text (the 0th capturing group).
Text()110     std::string Text() const {
111       ScopedStringChars text_str =
112           GetScopedStringChars(jni_cache_->GetEnv(), text_.get());
113       return text_str.get();
114     }
115 
116    private:
117     friend class RegexPattern;
118     RegexMatcher(const JniCache* jni_cache, ScopedGlobalRef<jobject> matcher,
119                  ScopedGlobalRef<jstring> text);
120     bool UpdateLastFindOffset() const;
121 
122     const JniCache* jni_cache_;
123     ScopedGlobalRef<jobject> matcher_;
124     ScopedGlobalRef<jstring> text_;
125     mutable int last_find_offset_ = 0;
126     mutable int last_find_offset_codepoints_ = 0;
127     mutable bool last_find_offset_dirty_ = true;
128   };
129 
130   class RegexPattern {
131    public:
132     std::unique_ptr<RegexMatcher> Matcher(const UnicodeText& context) const;
133 
134    private:
135     friend class UniLib;
136     RegexPattern(const JniCache* jni_cache, const UnicodeText& pattern,
137                  bool lazy);
138     void LockedInitializeIfNotAlready() const;
139 
140     const JniCache* jni_cache_;
141 
142     // These members need to be mutable because of the lazy initialization.
143     // NOTE: The Matcher method first ensures (using a lock) that the
144     // initialization was attempted (by using LockedInitializeIfNotAlready) and
145     // then can access them without locking.
146     mutable std::mutex mutex_;
147     mutable ScopedGlobalRef<jobject> pattern_;
148     mutable bool initialized_;
149     mutable bool initialization_failure_;
150     mutable UnicodeText pattern_text_;
151   };
152 
153   class BreakIterator {
154    public:
155     int Next();
156 
157     static constexpr int kDone = -1;
158 
159    private:
160     friend class UniLib;
161     BreakIterator(const JniCache* jni_cache, const UnicodeText& text);
162 
163     const JniCache* jni_cache_;
164     ScopedGlobalRef<jstring> text_;
165     ScopedGlobalRef<jobject> iterator_;
166     int last_break_index_;
167     int last_unicode_index_;
168   };
169 
170   std::unique_ptr<RegexPattern> CreateRegexPattern(
171       const UnicodeText& regex) const;
172   std::unique_ptr<RegexPattern> CreateLazyRegexPattern(
173       const UnicodeText& regex) const;
174   std::unique_ptr<BreakIterator> CreateBreakIterator(
175       const UnicodeText& text) const;
176 
177  private:
178   std::shared_ptr<JniCache> jni_cache_;
179 };
180 
181 }  // namespace libtextclassifier3
182 
183 #endif  // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
184