1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/utf8/unilib-javaicu.h"
18 
19 #include <math.h>
20 
21 #include <cassert>
22 #include <cctype>
23 #include <map>
24 
25 #include "utils/base/logging.h"
26 #include "utils/base/statusor.h"
27 #include "utils/java/jni-base.h"
28 #include "utils/java/jni-helper.h"
29 #include "utils/utf8/unicodetext.h"
30 
31 namespace libtextclassifier3 {
32 
UniLibBase()33 UniLibBase::UniLibBase() {
34   TC3_LOG(FATAL) << "Java ICU UniLib must be initialized with a JniCache.";
35 }
36 
UniLibBase(const std::shared_ptr<JniCache> & jni_cache)37 UniLibBase::UniLibBase(const std::shared_ptr<JniCache>& jni_cache)
38     : jni_cache_(jni_cache) {}
39 
IsOpeningBracket(char32 codepoint) const40 bool UniLibBase::IsOpeningBracket(char32 codepoint) const {
41   return libtextclassifier3::IsOpeningBracket(codepoint);
42 }
43 
IsClosingBracket(char32 codepoint) const44 bool UniLibBase::IsClosingBracket(char32 codepoint) const {
45   return libtextclassifier3::IsClosingBracket(codepoint);
46 }
47 
IsWhitespace(char32 codepoint) const48 bool UniLibBase::IsWhitespace(char32 codepoint) const {
49   return libtextclassifier3::IsWhitespace(codepoint);
50 }
51 
IsDigit(char32 codepoint) const52 bool UniLibBase::IsDigit(char32 codepoint) const {
53   return libtextclassifier3::IsDigit(codepoint);
54 }
55 
IsLower(char32 codepoint) const56 bool UniLibBase::IsLower(char32 codepoint) const {
57   return libtextclassifier3::IsLower(codepoint);
58 }
59 
IsUpper(char32 codepoint) const60 bool UniLibBase::IsUpper(char32 codepoint) const {
61   return libtextclassifier3::IsUpper(codepoint);
62 }
63 
IsPunctuation(char32 codepoint) const64 bool UniLibBase::IsPunctuation(char32 codepoint) const {
65   return libtextclassifier3::IsPunctuation(codepoint);
66 }
67 
ToLower(char32 codepoint) const68 char32 UniLibBase::ToLower(char32 codepoint) const {
69   return libtextclassifier3::ToLower(codepoint);
70 }
71 
ToUpper(char32 codepoint) const72 char32 UniLibBase::ToUpper(char32 codepoint) const {
73   return libtextclassifier3::ToUpper(codepoint);
74 }
75 
GetPairedBracket(char32 codepoint) const76 char32 UniLibBase::GetPairedBracket(char32 codepoint) const {
77   return libtextclassifier3::GetPairedBracket(codepoint);
78 }
79 
80 // -----------------------------------------------------------------------------
81 // Implementations that call out to JVM. Behold the beauty.
82 // -----------------------------------------------------------------------------
83 
Length(const UnicodeText & text) const84 StatusOr<int32> UniLibBase::Length(const UnicodeText& text) const {
85   TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jstring> text_java,
86                        jni_cache_->ConvertToJavaString(text));
87 
88   JNIEnv* jenv = jni_cache_->GetEnv();
89   TC3_ASSIGN_OR_RETURN(int utf16_length,
90                        JniHelper::CallIntMethod(jenv, text_java.get(),
91                                                 jni_cache_->string_length));
92 
93   return JniHelper::CallIntMethod(jenv, text_java.get(),
94                                   jni_cache_->string_code_point_count, 0,
95                                   utf16_length);
96 }
97 
ParseInt32(const UnicodeText & text,int32 * result) const98 bool UniLibBase::ParseInt32(const UnicodeText& text, int32* result) const {
99   return ParseInt(text, result);
100 }
101 
ParseInt64(const UnicodeText & text,int64 * result) const102 bool UniLibBase::ParseInt64(const UnicodeText& text, int64* result) const {
103   return ParseInt(text, result);
104 }
105 
ParseDouble(const UnicodeText & text,double * result) const106 bool UniLibBase::ParseDouble(const UnicodeText& text, double* result) const {
107   if (!jni_cache_) {
108     return false;
109   }
110 
111   auto it_dot = text.begin();
112   for (; it_dot != text.end() && !IsDot(*it_dot); it_dot++) {
113   }
114 
115   int32 integer_part;
116   if (!ParseInt(UnicodeText::Substring(text.begin(), it_dot, /*do_copy=*/false),
117                 &integer_part)) {
118     return false;
119   }
120 
121   int32 fractional_part = 0;
122   if (it_dot != text.end()) {
123     if (!ParseInt(
124             UnicodeText::Substring(++it_dot, text.end(), /*do_copy=*/false),
125             &fractional_part)) {
126       return false;
127     }
128   }
129 
130   double factional_part_double = fractional_part;
131   while (factional_part_double >= 1) {
132     factional_part_double /= 10;
133   }
134   *result = integer_part + factional_part_double;
135 
136   return true;
137 }
138 
CreateRegexPattern(const UnicodeText & regex) const139 std::unique_ptr<UniLibBase::RegexPattern> UniLibBase::CreateRegexPattern(
140     const UnicodeText& regex) const {
141   return std::unique_ptr<UniLibBase::RegexPattern>(
142       new UniLibBase::RegexPattern(jni_cache_.get(), regex, /*lazy=*/false));
143 }
144 
CreateLazyRegexPattern(const UnicodeText & regex) const145 std::unique_ptr<UniLibBase::RegexPattern> UniLibBase::CreateLazyRegexPattern(
146     const UnicodeText& regex) const {
147   return std::unique_ptr<UniLibBase::RegexPattern>(
148       new UniLibBase::RegexPattern(jni_cache_.get(), regex, /*lazy=*/true));
149 }
150 
RegexPattern(const JniCache * jni_cache,const UnicodeText & pattern,bool lazy)151 UniLibBase::RegexPattern::RegexPattern(const JniCache* jni_cache,
152                                        const UnicodeText& pattern, bool lazy)
153     : jni_cache_(jni_cache),
154       pattern_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
155       initialized_(false),
156       initialization_failure_(false),
157       pattern_text_(pattern) {
158   if (!lazy) {
159     LockedInitializeIfNotAlready();
160   }
161 }
162 
LockedInitializeIfNotAlready() const163 Status UniLibBase::RegexPattern::LockedInitializeIfNotAlready() const {
164   std::lock_guard<std::mutex> guard(mutex_);
165   if (initialized_ || initialization_failure_) {
166     return Status::OK;
167   }
168 
169   if (jni_cache_) {
170     JNIEnv* jenv = jni_cache_->GetEnv();
171     initialization_failure_ = true;
172     TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jstring> regex_java,
173                          jni_cache_->ConvertToJavaString(pattern_text_));
174     TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jobject> pattern,
175                          JniHelper::CallStaticObjectMethod(
176                              jenv, jni_cache_->pattern_class.get(),
177                              jni_cache_->pattern_compile, regex_java.get()));
178     pattern_ = MakeGlobalRef(pattern.get(), jenv, jni_cache_->jvm);
179     if (pattern_ == nullptr) {
180       return Status::UNKNOWN;
181     }
182 
183     initialization_failure_ = false;
184     initialized_ = true;
185     pattern_text_.clear();  // We don't need this anymore.
186   }
187   return Status::OK;
188 }
189 
190 constexpr int UniLibBase::RegexMatcher::kError;
191 constexpr int UniLibBase::RegexMatcher::kNoError;
192 
Matcher(const UnicodeText & context) const193 std::unique_ptr<UniLibBase::RegexMatcher> UniLibBase::RegexPattern::Matcher(
194     const UnicodeText& context) const {
195   LockedInitializeIfNotAlready();  // Possibly lazy initialization.
196   if (initialization_failure_) {
197     return nullptr;
198   }
199 
200   if (jni_cache_) {
201     JNIEnv* env = jni_cache_->GetEnv();
202     const StatusOr<ScopedLocalRef<jstring>> status_or_context_java =
203         jni_cache_->ConvertToJavaString(context);
204     if (!status_or_context_java.ok() || !status_or_context_java.ValueOrDie()) {
205       return nullptr;
206     }
207     const StatusOr<ScopedLocalRef<jobject>> status_or_matcher =
208         JniHelper::CallObjectMethod(env, pattern_.get(),
209                                     jni_cache_->pattern_matcher,
210                                     status_or_context_java.ValueOrDie().get());
211     if (jni_cache_->ExceptionCheckAndClear() || !status_or_matcher.ok() ||
212         !status_or_matcher.ValueOrDie()) {
213       return nullptr;
214     }
215     return std::unique_ptr<UniLibBase::RegexMatcher>(new RegexMatcher(
216         jni_cache_,
217         MakeGlobalRef(status_or_matcher.ValueOrDie().get(), env,
218                       jni_cache_->jvm),
219         MakeGlobalRef(status_or_context_java.ValueOrDie().get(), env,
220                       jni_cache_->jvm)));
221   } else {
222     // NOTE: A valid object needs to be created here to pass the interface
223     // tests.
224     return std::unique_ptr<UniLibBase::RegexMatcher>(
225         new RegexMatcher(jni_cache_, {}, {}));
226   }
227 }
228 
RegexMatcher(const JniCache * jni_cache,ScopedGlobalRef<jobject> matcher,ScopedGlobalRef<jstring> text)229 UniLibBase::RegexMatcher::RegexMatcher(const JniCache* jni_cache,
230                                        ScopedGlobalRef<jobject> matcher,
231                                        ScopedGlobalRef<jstring> text)
232     : jni_cache_(jni_cache),
233       matcher_(std::move(matcher)),
234       text_(std::move(text)) {}
235 
Matches(int * status) const236 bool UniLibBase::RegexMatcher::Matches(int* status) const {
237   if (jni_cache_) {
238     *status = kNoError;
239     const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
240         matcher_.get(), jni_cache_->matcher_matches);
241     if (jni_cache_->ExceptionCheckAndClear()) {
242       *status = kError;
243       return false;
244     }
245     return result;
246   } else {
247     *status = kError;
248     return false;
249   }
250 }
251 
ApproximatelyMatches(int * status)252 bool UniLibBase::RegexMatcher::ApproximatelyMatches(int* status) {
253   *status = kNoError;
254 
255   jni_cache_->GetEnv()->CallObjectMethod(matcher_.get(),
256                                          jni_cache_->matcher_reset);
257   if (jni_cache_->ExceptionCheckAndClear()) {
258     *status = kError;
259     return kError;
260   }
261 
262   if (!Find(status) || *status != kNoError) {
263     return false;
264   }
265 
266   const int found_start = jni_cache_->GetEnv()->CallIntMethod(
267       matcher_.get(), jni_cache_->matcher_start_idx, 0);
268   if (jni_cache_->ExceptionCheckAndClear()) {
269     *status = kError;
270     return kError;
271   }
272 
273   const int found_end = jni_cache_->GetEnv()->CallIntMethod(
274       matcher_.get(), jni_cache_->matcher_end_idx, 0);
275   if (jni_cache_->ExceptionCheckAndClear()) {
276     *status = kError;
277     return kError;
278   }
279 
280   int context_length_bmp = jni_cache_->GetEnv()->CallIntMethod(
281       text_.get(), jni_cache_->string_length);
282   if (jni_cache_->ExceptionCheckAndClear()) {
283     *status = kError;
284     return false;
285   }
286 
287   if (found_start != 0 || found_end != context_length_bmp) {
288     return false;
289   }
290 
291   return true;
292 }
293 
UpdateLastFindOffset() const294 bool UniLibBase::RegexMatcher::UpdateLastFindOffset() const {
295   if (!last_find_offset_dirty_) {
296     return true;
297   }
298 
299   const int find_offset = jni_cache_->GetEnv()->CallIntMethod(
300       matcher_.get(), jni_cache_->matcher_start_idx, 0);
301   if (jni_cache_->ExceptionCheckAndClear()) {
302     return false;
303   }
304 
305   const int codepoint_count = jni_cache_->GetEnv()->CallIntMethod(
306       text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
307       find_offset);
308   if (jni_cache_->ExceptionCheckAndClear()) {
309     return false;
310   }
311 
312   last_find_offset_codepoints_ += codepoint_count;
313   last_find_offset_ = find_offset;
314   last_find_offset_dirty_ = false;
315 
316   return true;
317 }
318 
Find(int * status)319 bool UniLibBase::RegexMatcher::Find(int* status) {
320   if (jni_cache_) {
321     const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
322         matcher_.get(), jni_cache_->matcher_find);
323     if (jni_cache_->ExceptionCheckAndClear()) {
324       *status = kError;
325       return false;
326     }
327 
328     last_find_offset_dirty_ = true;
329     *status = kNoError;
330     return result;
331   } else {
332     *status = kError;
333     return false;
334   }
335 }
336 
Start(int * status) const337 int UniLibBase::RegexMatcher::Start(int* status) const {
338   return Start(/*group_idx=*/0, status);
339 }
340 
Start(int group_idx,int * status) const341 int UniLibBase::RegexMatcher::Start(int group_idx, int* status) const {
342   if (jni_cache_) {
343     *status = kNoError;
344 
345     if (!UpdateLastFindOffset()) {
346       *status = kError;
347       return kError;
348     }
349 
350     const int java_index = jni_cache_->GetEnv()->CallIntMethod(
351         matcher_.get(), jni_cache_->matcher_start_idx, group_idx);
352     if (jni_cache_->ExceptionCheckAndClear()) {
353       *status = kError;
354       return kError;
355     }
356 
357     // If the group didn't participate in the match the index is -1.
358     if (java_index == -1) {
359       return -1;
360     }
361 
362     const int unicode_index = jni_cache_->GetEnv()->CallIntMethod(
363         text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
364         java_index);
365     if (jni_cache_->ExceptionCheckAndClear()) {
366       *status = kError;
367       return kError;
368     }
369 
370     return unicode_index + last_find_offset_codepoints_;
371   } else {
372     *status = kError;
373     return kError;
374   }
375 }
376 
End(int * status) const377 int UniLibBase::RegexMatcher::End(int* status) const {
378   return End(/*group_idx=*/0, status);
379 }
380 
End(int group_idx,int * status) const381 int UniLibBase::RegexMatcher::End(int group_idx, int* status) const {
382   if (jni_cache_) {
383     *status = kNoError;
384 
385     if (!UpdateLastFindOffset()) {
386       *status = kError;
387       return kError;
388     }
389 
390     const int java_index = jni_cache_->GetEnv()->CallIntMethod(
391         matcher_.get(), jni_cache_->matcher_end_idx, group_idx);
392     if (jni_cache_->ExceptionCheckAndClear()) {
393       *status = kError;
394       return kError;
395     }
396 
397     // If the group didn't participate in the match the index is -1.
398     if (java_index == -1) {
399       return -1;
400     }
401 
402     const int unicode_index = jni_cache_->GetEnv()->CallIntMethod(
403         text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
404         java_index);
405     if (jni_cache_->ExceptionCheckAndClear()) {
406       *status = kError;
407       return kError;
408     }
409 
410     return unicode_index + last_find_offset_codepoints_;
411   } else {
412     *status = kError;
413     return kError;
414   }
415 }
416 
Group(int * status) const417 UnicodeText UniLibBase::RegexMatcher::Group(int* status) const {
418   if (jni_cache_) {
419     JNIEnv* jenv = jni_cache_->GetEnv();
420     StatusOr<ScopedLocalRef<jstring>> status_or_java_result =
421         JniHelper::CallObjectMethod<jstring>(jenv, matcher_.get(),
422                                              jni_cache_->matcher_group);
423 
424     if (jni_cache_->ExceptionCheckAndClear() || !status_or_java_result.ok() ||
425         !status_or_java_result.ValueOrDie()) {
426       *status = kError;
427       return UTF8ToUnicodeText("", /*do_copy=*/false);
428     }
429 
430     StatusOr<std::string> status_or_result =
431         JStringToUtf8String(jenv, status_or_java_result.ValueOrDie().get());
432     if (!status_or_result.ok()) {
433       *status = kError;
434       return UTF8ToUnicodeText("", /*do_copy=*/false);
435     }
436     *status = kNoError;
437     return UTF8ToUnicodeText(status_or_result.ValueOrDie(), /*do_copy=*/true);
438   } else {
439     *status = kError;
440     return UTF8ToUnicodeText("", /*do_copy=*/false);
441   }
442 }
443 
Group(int group_idx,int * status) const444 UnicodeText UniLibBase::RegexMatcher::Group(int group_idx, int* status) const {
445   if (jni_cache_) {
446     JNIEnv* jenv = jni_cache_->GetEnv();
447 
448     StatusOr<ScopedLocalRef<jstring>> status_or_java_result =
449         JniHelper::CallObjectMethod<jstring>(
450             jenv, matcher_.get(), jni_cache_->matcher_group_idx, group_idx);
451     if (jni_cache_->ExceptionCheckAndClear() || !status_or_java_result.ok()) {
452       *status = kError;
453       TC3_LOG(ERROR) << "Exception occurred";
454       return UTF8ToUnicodeText("", /*do_copy=*/false);
455     }
456 
457     // java_result is nullptr when the group did not participate in the match.
458     // For these cases other UniLib implementations return empty string, and
459     // the participation can be checked by checking if Start() == -1.
460     if (!status_or_java_result.ValueOrDie()) {
461       *status = kNoError;
462       return UTF8ToUnicodeText("", /*do_copy=*/false);
463     }
464 
465     StatusOr<std::string> status_or_result =
466         JStringToUtf8String(jenv, status_or_java_result.ValueOrDie().get());
467     if (!status_or_result.ok()) {
468       *status = kError;
469       return UTF8ToUnicodeText("", /*do_copy=*/false);
470     }
471     *status = kNoError;
472     return UTF8ToUnicodeText(status_or_result.ValueOrDie(), /*do_copy=*/true);
473   } else {
474     *status = kError;
475     return UTF8ToUnicodeText("", /*do_copy=*/false);
476   }
477 }
478 
479 constexpr int UniLibBase::BreakIterator::kDone;
480 
BreakIterator(const JniCache * jni_cache,const UnicodeText & text)481 UniLibBase::BreakIterator::BreakIterator(const JniCache* jni_cache,
482                                          const UnicodeText& text)
483     : jni_cache_(jni_cache),
484       text_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
485       iterator_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
486       last_break_index_(0),
487       last_unicode_index_(0) {
488   if (jni_cache_) {
489     JNIEnv* jenv = jni_cache_->GetEnv();
490     StatusOr<ScopedLocalRef<jstring>> status_or_text =
491         jni_cache_->ConvertToJavaString(text);
492     if (!status_or_text.ok()) {
493       return;
494     }
495     text_ =
496         MakeGlobalRef(status_or_text.ValueOrDie().get(), jenv, jni_cache->jvm);
497     if (!text_) {
498       return;
499     }
500 
501     StatusOr<ScopedLocalRef<jobject>> status_or_iterator =
502         JniHelper::CallStaticObjectMethod(
503             jenv, jni_cache->breakiterator_class.get(),
504             jni_cache->breakiterator_getwordinstance,
505             jni_cache->locale_us.get());
506     if (!status_or_iterator.ok()) {
507       return;
508     }
509     iterator_ = MakeGlobalRef(status_or_iterator.ValueOrDie().get(), jenv,
510                               jni_cache->jvm);
511     if (!iterator_) {
512       return;
513     }
514     JniHelper::CallVoidMethod(jenv, iterator_.get(),
515                               jni_cache->breakiterator_settext, text_.get());
516   }
517 }
518 
Next()519 int UniLibBase::BreakIterator::Next() {
520   if (jni_cache_) {
521     const int break_index = jni_cache_->GetEnv()->CallIntMethod(
522         iterator_.get(), jni_cache_->breakiterator_next);
523     if (jni_cache_->ExceptionCheckAndClear() ||
524         break_index == BreakIterator::kDone) {
525       return BreakIterator::kDone;
526     }
527 
528     const int token_unicode_length = jni_cache_->GetEnv()->CallIntMethod(
529         text_.get(), jni_cache_->string_code_point_count, last_break_index_,
530         break_index);
531     if (jni_cache_->ExceptionCheckAndClear()) {
532       return BreakIterator::kDone;
533     }
534 
535     last_break_index_ = break_index;
536     return last_unicode_index_ += token_unicode_length;
537   }
538   return BreakIterator::kDone;
539 }
540 
CreateBreakIterator(const UnicodeText & text) const541 std::unique_ptr<UniLibBase::BreakIterator> UniLibBase::CreateBreakIterator(
542     const UnicodeText& text) const {
543   return std::unique_ptr<UniLibBase::BreakIterator>(
544       new UniLibBase::BreakIterator(jni_cache_.get(), text));
545 }
546 
547 }  // namespace libtextclassifier3
548