1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/utf8/unilib-javaicu.h"
18 
19 #include <algorithm>
20 #include <cassert>
21 #include <cctype>
22 #include <map>
23 
24 #include "utils/java/string_utils.h"
25 
26 namespace libtextclassifier3 {
27 namespace {
28 
29 // -----------------------------------------------------------------------------
30 // Native implementations.
31 // -----------------------------------------------------------------------------
32 
// Yields the number of elements in a statically sized array. Both the argument
// and the whole expansion are parenthesized so the macro behaves as a single
// operand inside larger expressions (e.g. `x / ARRAYSIZE(a)`).
#define ARRAYSIZE(a) (sizeof(a) / sizeof(*(a)))
34 
// Derived from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
// grep -E "Ps" UnicodeData.txt | \
//   sed -rne "s/^([0-9A-Z]{4});.*(PAREN|BRACKET|BRAKCET|BRACE).*/0x\1, /p"
// IMPORTANT: entries with the same offsets in kOpeningBrackets and
//            kClosingBrackets must be counterparts.
// NOTE(review): "BRAKCET" in the pattern looks deliberate (presumably to catch
// a misspelled official character name) -- confirm before "fixing" it.
// The values are listed in ascending order; the lookups in this file
// binary-search the table, so that order must be preserved.
constexpr char32 kOpeningBrackets[] = {
    0x0028, 0x005B, 0x007B, 0x0F3C, 0x2045, 0x207D, 0x208D, 0x2329, 0x2768,
    0x276A, 0x276C, 0x2770, 0x2772, 0x2774, 0x27E6, 0x27E8, 0x27EA, 0x27EC,
    0x27EE, 0x2983, 0x2985, 0x2987, 0x2989, 0x298B, 0x298D, 0x298F, 0x2991,
    0x2993, 0x2995, 0x2997, 0x29FC, 0x2E22, 0x2E24, 0x2E26, 0x2E28, 0x3008,
    0x300A, 0x300C, 0x300E, 0x3010, 0x3014, 0x3016, 0x3018, 0x301A, 0xFD3F,
    0xFE17, 0xFE35, 0xFE37, 0xFE39, 0xFE3B, 0xFE3D, 0xFE3F, 0xFE41, 0xFE43,
    0xFE47, 0xFE59, 0xFE5B, 0xFE5D, 0xFF08, 0xFF3B, 0xFF5B, 0xFF5F, 0xFF62};
constexpr int kNumOpeningBrackets = ARRAYSIZE(kOpeningBrackets);

// grep -E "Pe" UnicodeData.txt | \
//   sed -rne "s/^([0-9A-Z]{4});.*(PAREN|BRACKET|BRAKCET|BRACE).*/0x\1, /p"
// Entry i here is the closing counterpart of kOpeningBrackets[i]; the values
// are also in ascending order so the table can be binary-searched.
constexpr char32 kClosingBrackets[] = {
    0x0029, 0x005D, 0x007D, 0x0F3D, 0x2046, 0x207E, 0x208E, 0x232A, 0x2769,
    0x276B, 0x276D, 0x2771, 0x2773, 0x2775, 0x27E7, 0x27E9, 0x27EB, 0x27ED,
    0x27EF, 0x2984, 0x2986, 0x2988, 0x298A, 0x298C, 0x298E, 0x2990, 0x2992,
    0x2994, 0x2996, 0x2998, 0x29FD, 0x2E23, 0x2E25, 0x2E27, 0x2E29, 0x3009,
    0x300B, 0x300D, 0x300F, 0x3011, 0x3015, 0x3017, 0x3019, 0x301B, 0xFD3E,
    0xFE18, 0xFE36, 0xFE38, 0xFE3A, 0xFE3C, 0xFE3E, 0xFE40, 0xFE42, 0xFE44,
    0xFE48, 0xFE5A, 0xFE5C, 0xFE5E, 0xFF09, 0xFF3D, 0xFF5D, 0xFF60, 0xFF63};
constexpr int kNumClosingBrackets = ARRAYSIZE(kClosingBrackets);
61 
62 // grep -E "WS" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /"
63 constexpr char32 kWhitespaces[] = {
64     0x000C,  0x0020,  0x1680,  0x2000,  0x2001,  0x2002,  0x2003,  0x2004,
65     0x2005,  0x2006,  0x2007,  0x2008,  0x2009,  0x200A,  0x2028,  0x205F,
66     0x21C7,  0x21C8,  0x21C9,  0x21CA,  0x21F6,  0x2B31,  0x2B84,  0x2B85,
67     0x2B86,  0x2B87,  0x2B94,  0x3000,  0x4DCC,  0x10344, 0x10347, 0x1DA0A,
68     0x1DA0B, 0x1DA0C, 0x1DA0D, 0x1DA0E, 0x1DA0F, 0x1DA10, 0x1F4F0, 0x1F500,
69     0x1F501, 0x1F502, 0x1F503, 0x1F504, 0x1F5D8, 0x1F5DE};
70 constexpr int kNumWhitespaces = ARRAYSIZE(kWhitespaces);
71 
72 // grep -E "Nd" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /"
73 // As the name suggests, these ranges are always 10 codepoints long, so we just
74 // store the end of the range.
75 constexpr char32 kDecimalDigitRangesEnd[] = {
76     0x0039,  0x0669,  0x06f9,  0x07c9,  0x096f,  0x09ef,  0x0a6f,  0x0aef,
77     0x0b6f,  0x0bef,  0x0c6f,  0x0cef,  0x0d6f,  0x0def,  0x0e59,  0x0ed9,
78     0x0f29,  0x1049,  0x1099,  0x17e9,  0x1819,  0x194f,  0x19d9,  0x1a89,
79     0x1a99,  0x1b59,  0x1bb9,  0x1c49,  0x1c59,  0xa629,  0xa8d9,  0xa909,
80     0xa9d9,  0xa9f9,  0xaa59,  0xabf9,  0xff19,  0x104a9, 0x1106f, 0x110f9,
81     0x1113f, 0x111d9, 0x112f9, 0x11459, 0x114d9, 0x11659, 0x116c9, 0x11739,
82     0x118e9, 0x11c59, 0x11d59, 0x16a69, 0x16b59, 0x1d7ff};
83 constexpr int kNumDecimalDigitRangesEnd = ARRAYSIZE(kDecimalDigitRangesEnd);
84 
// grep -E "Lu" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /"
// There are three common ways in which upper/lower case codepoint ranges
// were introduced: one offs, dense ranges, and ranges that alternate between
// lower and upper case. For the sake of keeping our binary size down, we
// treat each independently.
//
// One-off uppercase codepoints, in ascending order.
constexpr char32 kUpperSingles[] = {
    0x01b8, 0x01bc, 0x01c4, 0x01c7, 0x01ca, 0x01f1, 0x0376, 0x037f,
    0x03cf, 0x03f4, 0x03fa, 0x10c7, 0x10cd, 0x2102, 0x2107, 0x2115,
    0x2145, 0x2183, 0x2c72, 0x2c75, 0x2cf2, 0xa7b6};
constexpr int kNumUpperSingles = ARRAYSIZE(kUpperSingles);
// Dense ranges (queried with stride 1): every codepoint in
// [kUpperRanges1Start[i], kUpperRanges1End[i]] is treated as uppercase.
constexpr char32 kUpperRanges1Start[] = {
    0x0041, 0x00c0, 0x00d8, 0x0181, 0x018a, 0x018e, 0x0193, 0x0196,
    0x019c, 0x019f, 0x01b2, 0x01f7, 0x023a, 0x023d, 0x0244, 0x0389,
    0x0392, 0x03a3, 0x03d2, 0x03fd, 0x0531, 0x10a0, 0x13a0, 0x1f08,
    0x1f18, 0x1f28, 0x1f38, 0x1f48, 0x1f68, 0x1fb8, 0x1fc8, 0x1fd8,
    0x1fe8, 0x1ff8, 0x210b, 0x2110, 0x2119, 0x212b, 0x2130, 0x213e,
    0x2c00, 0x2c63, 0x2c6e, 0x2c7e, 0xa7ab, 0xa7b0};
constexpr int kNumUpperRanges1Start = ARRAYSIZE(kUpperRanges1Start);
// Inclusive ends of the dense ranges; entry i pairs with kUpperRanges1Start[i].
constexpr char32 kUpperRanges1End[] = {
    0x005a, 0x00d6, 0x00de, 0x0182, 0x018b, 0x0191, 0x0194, 0x0198,
    0x019d, 0x01a0, 0x01b3, 0x01f8, 0x023b, 0x023e, 0x0246, 0x038a,
    0x03a1, 0x03ab, 0x03d4, 0x042f, 0x0556, 0x10c5, 0x13f5, 0x1f0f,
    0x1f1d, 0x1f2f, 0x1f3f, 0x1f4d, 0x1f6f, 0x1fbb, 0x1fcb, 0x1fdb,
    0x1fec, 0x1ffb, 0x210d, 0x2112, 0x211d, 0x212d, 0x2133, 0x213f,
    0x2c2e, 0x2c64, 0x2c70, 0x2c80, 0xa7ae, 0xa7b4};
constexpr int kNumUpperRanges1End = ARRAYSIZE(kUpperRanges1End);
// Alternating ranges (queried with stride 2): only codepoints at an even
// distance from kUpperRanges2Start[i] are treated as uppercase.
constexpr char32 kUpperRanges2Start[] = {
    0x0100, 0x0139, 0x014a, 0x0179, 0x0184, 0x0187, 0x01a2, 0x01a7, 0x01ac,
    0x01af, 0x01b5, 0x01cd, 0x01de, 0x01f4, 0x01fa, 0x0241, 0x0248, 0x0370,
    0x0386, 0x038c, 0x038f, 0x03d8, 0x03f7, 0x0460, 0x048a, 0x04c1, 0x04d0,
    0x1e00, 0x1e9e, 0x1f59, 0x2124, 0x2c60, 0x2c67, 0x2c82, 0x2ceb, 0xa640,
    0xa680, 0xa722, 0xa732, 0xa779, 0xa77e, 0xa78b, 0xa790, 0xa796};
constexpr int kNumUpperRanges2Start = ARRAYSIZE(kUpperRanges2Start);
// Inclusive ends of the alternating ranges; entry i pairs with
// kUpperRanges2Start[i].
constexpr char32 kUpperRanges2End[] = {
    0x0136, 0x0147, 0x0178, 0x017d, 0x0186, 0x0189, 0x01a6, 0x01a9, 0x01ae,
    0x01b1, 0x01b7, 0x01db, 0x01ee, 0x01f6, 0x0232, 0x0243, 0x024e, 0x0372,
    0x0388, 0x038e, 0x0391, 0x03ee, 0x03f9, 0x0480, 0x04c0, 0x04cd, 0x052e,
    0x1e94, 0x1efe, 0x1f5f, 0x212a, 0x2c62, 0x2c6d, 0x2ce2, 0x2ced, 0xa66c,
    0xa69a, 0xa72e, 0xa76e, 0xa77d, 0xa786, 0xa78d, 0xa792, 0xa7aa};
constexpr int kNumUpperRanges2End = ARRAYSIZE(kUpperRanges2End);
125 
// grep -E "Lu" UnicodeData.txt | \
//   sed -rne "s/^([0-9A-Z]+);.*;([0-9A-Z]+);$/(0x\1, 0x\2), /p"
// We have two strategies for mapping from upper to lower case. We have single
// character lookups that do not follow a pattern, and ranges for which there
// is a constant codepoint shift.
// Note that these ranges ignore anything that's not an upper case character,
// so when applied to a non-uppercase character the result is incorrect.
//
// Codepoints with an individual lowercase mapping, in ascending order.
// NOTE(review): these tables are declared as int but are handed to helpers
// that take `const char32*`, so char32 is presumably an alias of int --
// confirm against the typedef.
constexpr int kToLowerSingles[] = {
    0x0130, 0x0178, 0x0181, 0x0186, 0x018b, 0x018e, 0x018f, 0x0190, 0x0191,
    0x0194, 0x0196, 0x0197, 0x0198, 0x019c, 0x019d, 0x019f, 0x01a6, 0x01a9,
    0x01ae, 0x01b7, 0x01f6, 0x01f7, 0x0220, 0x023a, 0x023d, 0x023e, 0x0243,
    0x0244, 0x0245, 0x037f, 0x0386, 0x038c, 0x03cf, 0x03f4, 0x03f9, 0x04c0,
    0x1e9e, 0x1fec, 0x2126, 0x212a, 0x212b, 0x2132, 0x2183, 0x2c60, 0x2c62,
    0x2c63, 0x2c64, 0x2c6d, 0x2c6e, 0x2c6f, 0x2c70, 0xa77d, 0xa78d, 0xa7aa,
    0xa7ab, 0xa7ac, 0xa7ad, 0xa7ae, 0xa7b0, 0xa7b1, 0xa7b2, 0xa7b3};
constexpr int kNumToLowerSingles = ARRAYSIZE(kToLowerSingles);
// Signed delta added to kToLowerSingles[i] to obtain its lowercase codepoint.
constexpr int kToLowerSinglesOffsets[] = {
    -199,   -121,   210,    206,    1,      79,     202,    203,    1,
    207,    211,    209,    1,      211,    213,    214,    218,    218,
    218,    219,    -97,    -56,    -130,   10795,  -163,   10792,  -195,
    69,     71,     116,    38,     64,     8,      -60,    -7,     15,
    -7615,  -7,     -7517,  -8383,  -8262,  28,     1,      1,      -10743,
    -3814,  -10727, -10780, -10749, -10783, -10782, -35332, -42280, -42308,
    -42319, -42315, -42305, -42308, -42258, -42282, -42261, 928};
constexpr int kNumToLowerSinglesOffsets = ARRAYSIZE(kToLowerSinglesOffsets);
// Inclusive starts of the ranges that share one constant shift.
constexpr int kToLowerRangesStart[] = {
    0x0041, 0x0100, 0x0189, 0x01a0, 0x01b1, 0x01b3, 0x0388,  0x038e,  0x0391,
    0x03d8, 0x03fd, 0x0400, 0x0410, 0x0460, 0x0531, 0x10a0,  0x13a0,  0x13f0,
    0x1e00, 0x1f08, 0x1fba, 0x1fc8, 0x1fd8, 0x1fda, 0x1fe8,  0x1fea,  0x1ff8,
    0x1ffa, 0x2c00, 0x2c67, 0x2c7e, 0x2c80, 0xff21, 0x10400, 0x10c80, 0x118a0};
constexpr int kNumToLowerRangesStart = ARRAYSIZE(kToLowerRangesStart);
// Inclusive ends of those ranges, in ascending order; entry i pairs with
// kToLowerRangesStart[i].
constexpr int kToLowerRangesEnd[] = {
    0x00de, 0x0187, 0x019f, 0x01af, 0x01b2, 0x0386, 0x038c,  0x038f,  0x03cf,
    0x03fa, 0x03ff, 0x040f, 0x042f, 0x052e, 0x0556, 0x10cd,  0x13ef,  0x13f5,
    0x1efe, 0x1fb9, 0x1fbb, 0x1fcb, 0x1fd9, 0x1fdb, 0x1fe9,  0x1fec,  0x1ff9,
    0x2183, 0x2c64, 0x2c75, 0x2c7f, 0xa7b6, 0xff3a, 0x104d3, 0x10cb2, 0x118bf};
constexpr int kNumToLowerRangesEnd = ARRAYSIZE(kToLowerRangesEnd);
// Signed delta added to an uppercase codepoint inside range i to obtain its
// lowercase codepoint.
constexpr int kToLowerRangesOffsets[] = {
    32, 1,    205,  1,    217,   1, 37,     63, 32,  1,   -130, 80,
    32, 1,    48,   7264, 38864, 8, 1,      -8, -74, -86, -8,   -100,
    -8, -112, -128, -126, 48,    1, -10815, 1,  32,  40,  64,   32};
constexpr int kNumToLowerRangesOffsets = ARRAYSIZE(kToLowerRangesOffsets);
168 
// The macro is only needed to size the tables above, so retire it here.
#undef ARRAYSIZE

// Compile-time guards: tables that are indexed in parallel must have the same
// number of entries.
static_assert(kNumOpeningBrackets == kNumClosingBrackets,
              "mismatching number of opening and closing brackets");
static_assert(kNumUpperRanges1Start == kNumUpperRanges1End,
              "number of uppercase stride 1 range starts/ends doesn't match");
static_assert(kNumUpperRanges2Start == kNumUpperRanges2End,
              "number of uppercase stride 2 range starts/ends doesn't match");
static_assert(kNumToLowerSingles == kNumToLowerSinglesOffsets,
              "number of to lower singles and offsets doesn't match");
static_assert(kNumToLowerRangesStart == kNumToLowerRangesEnd,
              "mismatching number of range starts/ends for to lower ranges");
static_assert(kNumToLowerRangesStart == kNumToLowerRangesOffsets,
              "number of to lower ranges and offsets doesn't match");

// Sentinel index returned by the table lookup helpers when nothing matched.
constexpr int kNoMatch = -1;
185 
186 // Returns the index of the element in the array that matched the given
187 // codepoint, or kNoMatch if the element didn't exist.
188 // The input array must be in sorted order.
GetMatchIndex(const char32 * array,int array_length,char32 c)189 int GetMatchIndex(const char32* array, int array_length, char32 c) {
190   const char32* end = array + array_length;
191   const auto find_it = std::lower_bound(array, end, c);
192   if (find_it != end && *find_it == c) {
193     return find_it - array;
194   } else {
195     return kNoMatch;
196   }
197 }
198 
199 // Returns the index of the range in the array that overlapped the given
200 // codepoint, or kNoMatch if no such range existed.
201 // The input array must be in sorted order.
GetOverlappingRangeIndex(const char32 * arr,int arr_length,int range_length,char32 c)202 int GetOverlappingRangeIndex(const char32* arr, int arr_length,
203                              int range_length, char32 c) {
204   const char32* end = arr + arr_length;
205   const auto find_it = std::lower_bound(arr, end, c);
206   if (find_it == end) {
207     return kNoMatch;
208   }
209   // The end is inclusive, we so subtract one less than the range length.
210   const char32 range_end = *find_it;
211   const char32 range_start = range_end - (range_length - 1);
212   if (c < range_start || range_end < c) {
213     return kNoMatch;
214   } else {
215     return find_it - arr;
216   }
217 }
218 
219 // As above, but with explicit codepoint start and end indices for the range.
220 // The input array must be in sorted order.
GetOverlappingRangeIndex(const char32 * start_arr,const char32 * end_arr,int arr_length,int stride,char32 c)221 int GetOverlappingRangeIndex(const char32* start_arr, const char32* end_arr,
222                              int arr_length, int stride, char32 c) {
223   const char32* end_arr_end = end_arr + arr_length;
224   const auto find_it = std::lower_bound(end_arr, end_arr_end, c);
225   if (find_it == end_arr_end) {
226     return kNoMatch;
227   }
228   // Find the corresponding start.
229   const int range_index = find_it - end_arr;
230   const char32 range_start = start_arr[range_index];
231   const char32 range_end = *find_it;
232   if (c < range_start || range_end < c) {
233     return kNoMatch;
234   }
235   if ((c - range_start) % stride == 0) {
236     return range_index;
237   } else {
238     return kNoMatch;
239   }
240 }
241 
242 }  // anonymous namespace
243 
UniLib()244 UniLib::UniLib() {
245   TC3_LOG(FATAL) << "Java ICU UniLib must be initialized with a JniCache.";
246 }
247 
UniLib(const std::shared_ptr<JniCache> & jni_cache)248 UniLib::UniLib(const std::shared_ptr<JniCache>& jni_cache)
249     : jni_cache_(jni_cache) {}
250 
IsOpeningBracket(char32 codepoint) const251 bool UniLib::IsOpeningBracket(char32 codepoint) const {
252   return GetMatchIndex(kOpeningBrackets, kNumOpeningBrackets, codepoint) >= 0;
253 }
254 
IsClosingBracket(char32 codepoint) const255 bool UniLib::IsClosingBracket(char32 codepoint) const {
256   return GetMatchIndex(kClosingBrackets, kNumClosingBrackets, codepoint) >= 0;
257 }
258 
IsWhitespace(char32 codepoint) const259 bool UniLib::IsWhitespace(char32 codepoint) const {
260   return GetMatchIndex(kWhitespaces, kNumWhitespaces, codepoint) >= 0;
261 }
262 
IsDigit(char32 codepoint) const263 bool UniLib::IsDigit(char32 codepoint) const {
264   return GetOverlappingRangeIndex(kDecimalDigitRangesEnd,
265                                   kNumDecimalDigitRangesEnd,
266                                   /*range_length=*/10, codepoint) >= 0;
267 }
268 
IsUpper(char32 codepoint) const269 bool UniLib::IsUpper(char32 codepoint) const {
270   if (GetMatchIndex(kUpperSingles, kNumUpperSingles, codepoint) >= 0) {
271     return true;
272   } else if (GetOverlappingRangeIndex(kUpperRanges1Start, kUpperRanges1End,
273                                       kNumUpperRanges1Start, /*stride=*/1,
274                                       codepoint) >= 0) {
275     return true;
276   } else if (GetOverlappingRangeIndex(kUpperRanges2Start, kUpperRanges2End,
277                                       kNumUpperRanges2Start, /*stride=*/2,
278                                       codepoint) >= 0) {
279     return true;
280   } else {
281     return false;
282   }
283 }
284 
ToLower(char32 codepoint) const285 char32 UniLib::ToLower(char32 codepoint) const {
286   // Make sure we still produce output even if the method is called for a
287   // codepoint that's not an uppercase character.
288   if (!IsUpper(codepoint)) {
289     return codepoint;
290   }
291   const int singles_idx =
292       GetMatchIndex(kToLowerSingles, kNumToLowerSingles, codepoint);
293   if (singles_idx >= 0) {
294     return codepoint + kToLowerSinglesOffsets[singles_idx];
295   }
296   const int ranges_idx =
297       GetOverlappingRangeIndex(kToLowerRangesStart, kToLowerRangesEnd,
298                                kNumToLowerRangesStart, /*stride=*/1, codepoint);
299   if (ranges_idx >= 0) {
300     return codepoint + kToLowerRangesOffsets[ranges_idx];
301   }
302   return codepoint;
303 }
304 
GetPairedBracket(char32 codepoint) const305 char32 UniLib::GetPairedBracket(char32 codepoint) const {
306   const int open_offset =
307       GetMatchIndex(kOpeningBrackets, kNumOpeningBrackets, codepoint);
308   if (open_offset >= 0) {
309     return kClosingBrackets[open_offset];
310   }
311   const int close_offset =
312       GetMatchIndex(kClosingBrackets, kNumClosingBrackets, codepoint);
313   if (close_offset >= 0) {
314     return kOpeningBrackets[close_offset];
315   }
316   return codepoint;
317 }
318 
319 // -----------------------------------------------------------------------------
320 // Implementations that call out to JVM. Behold the beauty.
321 // -----------------------------------------------------------------------------
322 
ParseInt32(const UnicodeText & text,int * result) const323 bool UniLib::ParseInt32(const UnicodeText& text, int* result) const {
324   if (jni_cache_) {
325     JNIEnv* env = jni_cache_->GetEnv();
326     const ScopedLocalRef<jstring> text_java =
327         jni_cache_->ConvertToJavaString(text);
328     jint res = env->CallStaticIntMethod(jni_cache_->integer_class.get(),
329                                         jni_cache_->integer_parse_int,
330                                         text_java.get());
331     if (jni_cache_->ExceptionCheckAndClear()) {
332       return false;
333     }
334     *result = res;
335     return true;
336   }
337   return false;
338 }
339 
CreateRegexPattern(const UnicodeText & regex) const340 std::unique_ptr<UniLib::RegexPattern> UniLib::CreateRegexPattern(
341     const UnicodeText& regex) const {
342   return std::unique_ptr<UniLib::RegexPattern>(
343       new UniLib::RegexPattern(jni_cache_.get(), regex, /*lazy=*/false));
344 }
345 
CreateLazyRegexPattern(const UnicodeText & regex) const346 std::unique_ptr<UniLib::RegexPattern> UniLib::CreateLazyRegexPattern(
347     const UnicodeText& regex) const {
348   return std::unique_ptr<UniLib::RegexPattern>(
349       new UniLib::RegexPattern(jni_cache_.get(), regex, /*lazy=*/true));
350 }
351 
RegexPattern(const JniCache * jni_cache,const UnicodeText & pattern,bool lazy)352 UniLib::RegexPattern::RegexPattern(const JniCache* jni_cache,
353                                    const UnicodeText& pattern, bool lazy)
354     : jni_cache_(jni_cache),
355       pattern_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
356       initialized_(false),
357       initialization_failure_(false),
358       pattern_text_(pattern) {
359   if (!lazy) {
360     LockedInitializeIfNotAlready();
361   }
362 }
363 
LockedInitializeIfNotAlready() const364 void UniLib::RegexPattern::LockedInitializeIfNotAlready() const {
365   std::lock_guard<std::mutex> guard(mutex_);
366   if (initialized_ || initialization_failure_) {
367     return;
368   }
369 
370   if (jni_cache_) {
371     JNIEnv* jenv = jni_cache_->GetEnv();
372     const ScopedLocalRef<jstring> regex_java =
373         jni_cache_->ConvertToJavaString(pattern_text_);
374     pattern_ = MakeGlobalRef(jenv->CallStaticObjectMethod(
375                                  jni_cache_->pattern_class.get(),
376                                  jni_cache_->pattern_compile, regex_java.get()),
377                              jenv, jni_cache_->jvm);
378 
379     if (jni_cache_->ExceptionCheckAndClear() || pattern_ == nullptr) {
380       initialization_failure_ = true;
381       pattern_.reset();
382       return;
383     }
384 
385     initialized_ = true;
386     pattern_text_.clear();  // We don't need this anymore.
387   }
388 }
389 
// Out-of-class definitions for the static constexpr status constants of
// RegexMatcher (their values are given at the in-class declarations).
constexpr int UniLib::RegexMatcher::kError;
constexpr int UniLib::RegexMatcher::kNoError;
392 
Matcher(const UnicodeText & context) const393 std::unique_ptr<UniLib::RegexMatcher> UniLib::RegexPattern::Matcher(
394     const UnicodeText& context) const {
395   LockedInitializeIfNotAlready();  // Possibly lazy initialization.
396   if (initialization_failure_) {
397     return nullptr;
398   }
399 
400   if (jni_cache_) {
401     JNIEnv* env = jni_cache_->GetEnv();
402     const jstring context_java =
403         jni_cache_->ConvertToJavaString(context).release();
404     if (!context_java) {
405       return nullptr;
406     }
407     const jobject matcher = env->CallObjectMethod(
408         pattern_.get(), jni_cache_->pattern_matcher, context_java);
409     if (jni_cache_->ExceptionCheckAndClear() || !matcher) {
410       return nullptr;
411     }
412     return std::unique_ptr<UniLib::RegexMatcher>(new RegexMatcher(
413         jni_cache_, MakeGlobalRef(matcher, env, jni_cache_->jvm),
414         MakeGlobalRef(context_java, env, jni_cache_->jvm)));
415   } else {
416     // NOTE: A valid object needs to be created here to pass the interface
417     // tests.
418     return std::unique_ptr<UniLib::RegexMatcher>(
419         new RegexMatcher(jni_cache_, nullptr, nullptr));
420   }
421 }
422 
RegexMatcher(const JniCache * jni_cache,ScopedGlobalRef<jobject> matcher,ScopedGlobalRef<jstring> text)423 UniLib::RegexMatcher::RegexMatcher(const JniCache* jni_cache,
424                                    ScopedGlobalRef<jobject> matcher,
425                                    ScopedGlobalRef<jstring> text)
426     : jni_cache_(jni_cache),
427       matcher_(std::move(matcher)),
428       text_(std::move(text)) {}
429 
Matches(int * status) const430 bool UniLib::RegexMatcher::Matches(int* status) const {
431   if (jni_cache_) {
432     *status = kNoError;
433     const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
434         matcher_.get(), jni_cache_->matcher_matches);
435     if (jni_cache_->ExceptionCheckAndClear()) {
436       *status = kError;
437       return false;
438     }
439     return result;
440   } else {
441     *status = kError;
442     return false;
443   }
444 }
445 
ApproximatelyMatches(int * status)446 bool UniLib::RegexMatcher::ApproximatelyMatches(int* status) {
447   *status = kNoError;
448 
449   jni_cache_->GetEnv()->CallObjectMethod(matcher_.get(),
450                                          jni_cache_->matcher_reset);
451   if (jni_cache_->ExceptionCheckAndClear()) {
452     *status = kError;
453     return kError;
454   }
455 
456   if (!Find(status) || *status != kNoError) {
457     return false;
458   }
459 
460   const int found_start = jni_cache_->GetEnv()->CallIntMethod(
461       matcher_.get(), jni_cache_->matcher_start_idx, 0);
462   if (jni_cache_->ExceptionCheckAndClear()) {
463     *status = kError;
464     return kError;
465   }
466 
467   const int found_end = jni_cache_->GetEnv()->CallIntMethod(
468       matcher_.get(), jni_cache_->matcher_end_idx, 0);
469   if (jni_cache_->ExceptionCheckAndClear()) {
470     *status = kError;
471     return kError;
472   }
473 
474   int context_length_bmp = jni_cache_->GetEnv()->CallIntMethod(
475       text_.get(), jni_cache_->string_length);
476   if (jni_cache_->ExceptionCheckAndClear()) {
477     *status = kError;
478     return false;
479   }
480 
481   if (found_start != 0 || found_end != context_length_bmp) {
482     return false;
483   }
484 
485   return true;
486 }
487 
UpdateLastFindOffset() const488 bool UniLib::RegexMatcher::UpdateLastFindOffset() const {
489   if (!last_find_offset_dirty_) {
490     return true;
491   }
492 
493   const int find_offset = jni_cache_->GetEnv()->CallIntMethod(
494       matcher_.get(), jni_cache_->matcher_start_idx, 0);
495   if (jni_cache_->ExceptionCheckAndClear()) {
496     return false;
497   }
498 
499   const int codepoint_count = jni_cache_->GetEnv()->CallIntMethod(
500       text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
501       find_offset);
502   if (jni_cache_->ExceptionCheckAndClear()) {
503     return false;
504   }
505 
506   last_find_offset_codepoints_ += codepoint_count;
507   last_find_offset_ = find_offset;
508   last_find_offset_dirty_ = false;
509 
510   return true;
511 }
512 
Find(int * status)513 bool UniLib::RegexMatcher::Find(int* status) {
514   if (jni_cache_) {
515     const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
516         matcher_.get(), jni_cache_->matcher_find);
517     if (jni_cache_->ExceptionCheckAndClear()) {
518       *status = kError;
519       return false;
520     }
521 
522     last_find_offset_dirty_ = true;
523     *status = kNoError;
524     return result;
525   } else {
526     *status = kError;
527     return false;
528   }
529 }
530 
Start(int * status) const531 int UniLib::RegexMatcher::Start(int* status) const {
532   return Start(/*group_idx=*/0, status);
533 }
534 
Start(int group_idx,int * status) const535 int UniLib::RegexMatcher::Start(int group_idx, int* status) const {
536   if (jni_cache_) {
537     *status = kNoError;
538 
539     if (!UpdateLastFindOffset()) {
540       *status = kError;
541       return kError;
542     }
543 
544     const int java_index = jni_cache_->GetEnv()->CallIntMethod(
545         matcher_.get(), jni_cache_->matcher_start_idx, group_idx);
546     if (jni_cache_->ExceptionCheckAndClear()) {
547       *status = kError;
548       return kError;
549     }
550 
551     // If the group didn't participate in the match the index is -1.
552     if (java_index == -1) {
553       return -1;
554     }
555 
556     const int unicode_index = jni_cache_->GetEnv()->CallIntMethod(
557         text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
558         java_index);
559     if (jni_cache_->ExceptionCheckAndClear()) {
560       *status = kError;
561       return kError;
562     }
563 
564     return unicode_index + last_find_offset_codepoints_;
565   } else {
566     *status = kError;
567     return kError;
568   }
569 }
570 
End(int * status) const571 int UniLib::RegexMatcher::End(int* status) const {
572   return End(/*group_idx=*/0, status);
573 }
574 
End(int group_idx,int * status) const575 int UniLib::RegexMatcher::End(int group_idx, int* status) const {
576   if (jni_cache_) {
577     *status = kNoError;
578 
579     if (!UpdateLastFindOffset()) {
580       *status = kError;
581       return kError;
582     }
583 
584     const int java_index = jni_cache_->GetEnv()->CallIntMethod(
585         matcher_.get(), jni_cache_->matcher_end_idx, group_idx);
586     if (jni_cache_->ExceptionCheckAndClear()) {
587       *status = kError;
588       return kError;
589     }
590 
591     // If the group didn't participate in the match the index is -1.
592     if (java_index == -1) {
593       return -1;
594     }
595 
596     const int unicode_index = jni_cache_->GetEnv()->CallIntMethod(
597         text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
598         java_index);
599     if (jni_cache_->ExceptionCheckAndClear()) {
600       *status = kError;
601       return kError;
602     }
603 
604     return unicode_index + last_find_offset_codepoints_;
605   } else {
606     *status = kError;
607     return kError;
608   }
609 }
610 
Group(int * status) const611 UnicodeText UniLib::RegexMatcher::Group(int* status) const {
612   if (jni_cache_) {
613     JNIEnv* jenv = jni_cache_->GetEnv();
614     const ScopedLocalRef<jstring> java_result(
615         reinterpret_cast<jstring>(
616             jenv->CallObjectMethod(matcher_.get(), jni_cache_->matcher_group)),
617         jenv);
618     if (jni_cache_->ExceptionCheckAndClear() || !java_result) {
619       *status = kError;
620       return UTF8ToUnicodeText("", /*do_copy=*/false);
621     }
622 
623     std::string result;
624     if (!JStringToUtf8String(jenv, java_result.get(), &result)) {
625       *status = kError;
626       return UTF8ToUnicodeText("", /*do_copy=*/false);
627     }
628     *status = kNoError;
629     return UTF8ToUnicodeText(result, /*do_copy=*/true);
630   } else {
631     *status = kError;
632     return UTF8ToUnicodeText("", /*do_copy=*/false);
633   }
634 }
635 
Group(int group_idx,int * status) const636 UnicodeText UniLib::RegexMatcher::Group(int group_idx, int* status) const {
637   if (jni_cache_) {
638     JNIEnv* jenv = jni_cache_->GetEnv();
639     const ScopedLocalRef<jstring> java_result(
640         reinterpret_cast<jstring>(jenv->CallObjectMethod(
641             matcher_.get(), jni_cache_->matcher_group_idx, group_idx)),
642         jenv);
643     if (jni_cache_->ExceptionCheckAndClear()) {
644       *status = kError;
645       TC3_LOG(ERROR) << "Exception occurred";
646       return UTF8ToUnicodeText("", /*do_copy=*/false);
647     }
648 
649     // java_result is nullptr when the group did not participate in the match.
650     // For these cases other UniLib implementations return empty string, and
651     // the participation can be checked by checking if Start() == -1.
652     if (!java_result) {
653       *status = kNoError;
654       return UTF8ToUnicodeText("", /*do_copy=*/false);
655     }
656 
657     std::string result;
658     if (!JStringToUtf8String(jenv, java_result.get(), &result)) {
659       *status = kError;
660       return UTF8ToUnicodeText("", /*do_copy=*/false);
661     }
662     *status = kNoError;
663     return UTF8ToUnicodeText(result, /*do_copy=*/true);
664   } else {
665     *status = kError;
666     return UTF8ToUnicodeText("", /*do_copy=*/false);
667   }
668 }
669 
// Out-of-class definition for the static constexpr end-of-iteration marker
// (its value is given at the in-class declaration).
constexpr int UniLib::BreakIterator::kDone;
671 
BreakIterator(const JniCache * jni_cache,const UnicodeText & text)672 UniLib::BreakIterator::BreakIterator(const JniCache* jni_cache,
673                                      const UnicodeText& text)
674     : jni_cache_(jni_cache),
675       text_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
676       iterator_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
677       last_break_index_(0),
678       last_unicode_index_(0) {
679   if (jni_cache_) {
680     JNIEnv* jenv = jni_cache_->GetEnv();
681     text_ = MakeGlobalRef(jni_cache_->ConvertToJavaString(text).release(), jenv,
682                           jni_cache->jvm);
683     if (!text_) {
684       return;
685     }
686 
687     iterator_ = MakeGlobalRef(
688         jenv->CallStaticObjectMethod(jni_cache->breakiterator_class.get(),
689                                      jni_cache->breakiterator_getwordinstance,
690                                      jni_cache->locale_us.get()),
691         jenv, jni_cache->jvm);
692     if (!iterator_) {
693       return;
694     }
695     jenv->CallVoidMethod(iterator_.get(), jni_cache->breakiterator_settext,
696                          text_.get());
697   }
698 }
699 
Next()700 int UniLib::BreakIterator::Next() {
701   if (jni_cache_) {
702     const int break_index = jni_cache_->GetEnv()->CallIntMethod(
703         iterator_.get(), jni_cache_->breakiterator_next);
704     if (jni_cache_->ExceptionCheckAndClear() ||
705         break_index == BreakIterator::kDone) {
706       return BreakIterator::kDone;
707     }
708 
709     const int token_unicode_length = jni_cache_->GetEnv()->CallIntMethod(
710         text_.get(), jni_cache_->string_code_point_count, last_break_index_,
711         break_index);
712     if (jni_cache_->ExceptionCheckAndClear()) {
713       return BreakIterator::kDone;
714     }
715 
716     last_break_index_ = break_index;
717     return last_unicode_index_ += token_unicode_length;
718   }
719   return BreakIterator::kDone;
720 }
721 
CreateBreakIterator(const UnicodeText & text) const722 std::unique_ptr<UniLib::BreakIterator> UniLib::CreateBreakIterator(
723     const UnicodeText& text) const {
724   return std::unique_ptr<UniLib::BreakIterator>(
725       new UniLib::BreakIterator(jni_cache_.get(), text));
726 }
727 
728 }  // namespace libtextclassifier3
729