1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // An implementation of Unilib that uses Android Java interfaces via JNI. The
18 // performance critical ops have been re-implemented in C++.
19 // Specifically, this class must be compatible with API level 14 (ICS).
20 
21 #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
22 #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
23 
24 #include <jni.h>
25 
26 #include <memory>
27 #include <mutex>  // NOLINT
28 #include <string>
29 
30 #include "utils/base/integral_types.h"
31 #include "utils/java/jni-base.h"
32 #include "utils/java/jni-cache.h"
33 #include "utils/java/jni-helper.h"
34 #include "utils/utf8/unicodetext.h"
35 #include "utils/utf8/unilib-common.h"
36 
37 namespace libtextclassifier3 {
38 
39 class UniLibBase {
40  public:
41   UniLibBase();
42   explicit UniLibBase(const std::shared_ptr<JniCache>& jni_cache);
43 
44   bool ParseInt32(const UnicodeText& text, int32* result) const;
45   bool ParseInt64(const UnicodeText& text, int64* result) const;
46   bool ParseDouble(const UnicodeText& text, double* result) const;
47 
48   bool IsOpeningBracket(char32 codepoint) const;
49   bool IsClosingBracket(char32 codepoint) const;
50   bool IsWhitespace(char32 codepoint) const;
51   bool IsDigit(char32 codepoint) const;
52   bool IsLower(char32 codepoint) const;
53   bool IsUpper(char32 codepoint) const;
54   bool IsPunctuation(char32 codepoint) const;
55 
56   char32 ToLower(char32 codepoint) const;
57   char32 ToUpper(char32 codepoint) const;
58   char32 GetPairedBracket(char32 codepoint) const;
59 
60   StatusOr<int32> Length(const UnicodeText& text) const;
61 
62   // Forward declaration for friend.
63   class RegexPattern;
64 
65   class RegexMatcher {
66    public:
67     static constexpr int kError = -1;
68     static constexpr int kNoError = 0;
69 
70     // Checks whether the input text matches the pattern exactly.
71     bool Matches(int* status) const;
72 
73     // Approximate Matches() implementation implemented using Find(). It uses
74     // the first Find() result and then checks that it spans the whole input.
75     // NOTE: Unlike Matches() it can result in false negatives.
76     // NOTE: Resets the matcher, so the current Find() state will be lost.
77     bool ApproximatelyMatches(int* status);
78 
79     // Finds occurrences of the pattern in the input text.
80     // Can be called repeatedly to find all occurrences. A call will update
81     // internal state, so that 'Start', 'End' and 'Group' can be called to get
82     // information about the match.
83     // NOTE: Any call to ApproximatelyMatches() in between Find() calls will
84     // modify the state.
85     bool Find(int* status);
86 
87     // Gets the start offset of the last match (from  'Find').
88     // Sets status to 'kError' if 'Find'
89     // was not called previously.
90     int Start(int* status) const;
91 
92     // Gets the start offset of the specified group of the last match.
93     // (from  'Find').
94     // Sets status to 'kError' if an invalid group was specified or if 'Find'
95     // was not called previously.
96     int Start(int group_idx, int* status) const;
97 
98     // Gets the end offset of the last match (from  'Find').
99     // Sets status to 'kError' if 'Find'
100     // was not called previously.
101     int End(int* status) const;
102 
103     // Gets the end offset of the specified group of the last match.
104     // (from  'Find').
105     // Sets status to 'kError' if an invalid group was specified or if 'Find'
106     // was not called previously.
107     int End(int group_idx, int* status) const;
108 
109     // Gets the text of the last match (from 'Find').
110     // Sets status to 'kError' if 'Find' was not called previously.
111     UnicodeText Group(int* status) const;
112 
113     // Gets the text of the specified group of the last match (from 'Find').
114     // Sets status to 'kError' if an invalid group was specified or if 'Find'
115     // was not called previously.
116     UnicodeText Group(int group_idx, int* status) const;
117 
118     // Returns the matched text (the 0th capturing group).
Text()119     std::string Text() const {
120       StatusOr<std::string> status_or_result =
121           JStringToUtf8String(jni_cache_->GetEnv(), text_.get());
122       if (!status_or_result.ok()) {
123         TC3_LOG(ERROR) << "JStringToUtf8String failed.";
124         return "";
125       }
126       return status_or_result.ValueOrDie();
127     }
128 
129    private:
130     friend class RegexPattern;
131     RegexMatcher(const JniCache* jni_cache, ScopedGlobalRef<jobject> matcher,
132                  ScopedGlobalRef<jstring> text);
133     bool UpdateLastFindOffset() const;
134 
135     const JniCache* jni_cache_;
136     ScopedGlobalRef<jobject> matcher_;
137     ScopedGlobalRef<jstring> text_;
138     mutable int last_find_offset_ = 0;
139     mutable int last_find_offset_codepoints_ = 0;
140     mutable bool last_find_offset_dirty_ = true;
141   };
142 
143   class RegexPattern {
144    public:
145     std::unique_ptr<RegexMatcher> Matcher(const UnicodeText& context) const;
146 
147    private:
148     friend class UniLibBase;
149     RegexPattern(const JniCache* jni_cache, const UnicodeText& pattern,
150                  bool lazy);
151     Status LockedInitializeIfNotAlready() const;
152 
153     const JniCache* jni_cache_;
154 
155     // These members need to be mutable because of the lazy initialization.
156     // NOTE: The Matcher method first ensures (using a lock) that the
157     // initialization was attempted (by using LockedInitializeIfNotAlready) and
158     // then can access them without locking.
159     mutable std::mutex mutex_;
160     mutable ScopedGlobalRef<jobject> pattern_;
161     mutable bool initialized_;
162     mutable bool initialization_failure_;
163     mutable UnicodeText pattern_text_;
164   };
165 
166   class BreakIterator {
167    public:
168     int Next();
169 
170     static constexpr int kDone = -1;
171 
172    private:
173     friend class UniLibBase;
174     BreakIterator(const JniCache* jni_cache, const UnicodeText& text);
175 
176     const JniCache* jni_cache_;
177     ScopedGlobalRef<jstring> text_;
178     ScopedGlobalRef<jobject> iterator_;
179     int last_break_index_;
180     int last_unicode_index_;
181   };
182 
183   std::unique_ptr<RegexPattern> CreateRegexPattern(
184       const UnicodeText& regex) const;
185   std::unique_ptr<RegexPattern> CreateLazyRegexPattern(
186       const UnicodeText& regex) const;
187   std::unique_ptr<BreakIterator> CreateBreakIterator(
188       const UnicodeText& text) const;
189 
190  private:
191   template <class T>
192   bool ParseInt(const UnicodeText& text, T* result) const;
193 
194   std::shared_ptr<JniCache> jni_cache_;
195 };
196 
197 template <class T>
ParseInt(const UnicodeText & text,T * result)198 bool UniLibBase::ParseInt(const UnicodeText& text, T* result) const {
199   if (!jni_cache_) {
200     return false;
201   }
202 
203   // Avoid throwing exceptions when the text is unlikely to be a number.
204   int32 result32 = 0;
205   if (!PassesIntPreChesks(text, result32)) {
206     return false;
207   }
208 
209   JNIEnv* env = jni_cache_->GetEnv();
210   TC3_ASSIGN_OR_RETURN_FALSE(const ScopedLocalRef<jstring> text_java,
211                              jni_cache_->ConvertToJavaString(text));
212   TC3_ASSIGN_OR_RETURN_FALSE(
213       *result,
214       JniHelper::CallStaticIntMethod<T>(
215           env,
216           /*print_exception_on_error=*/false, jni_cache_->integer_class.get(),
217           jni_cache_->integer_parse_int, text_java.get()));
218   return true;
219 }
220 
221 }  // namespace libtextclassifier3
222 
223 #endif  // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
224