1 /*
2  * Copyright (C) 2010 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "Matcher"
18 
19 #include <memory>
20 #include <stdlib.h>
21 
22 #include <android-base/logging.h>
23 #include <nativehelper/JNIHelp.h>
24 #include <nativehelper/ScopedPrimitiveArray.h>
25 #include <nativehelper/ScopedStringChars.h>
26 #include <nativehelper/jni_macros.h>
27 
28 #include "IcuUtilities.h"
29 #include "JniException.h"
30 #include "ScopedJavaUnicodeString.h"
31 #include "unicode/parseerr.h"
32 #include "unicode/regex.h"
33 
34 // ICU documentation: http://icu-project.org/apiref/icu4c/classRegexMatcher.html
35 
36 /**
37  * Encapsulates an instance of ICU4C's RegexMatcher class along with a copy of
38  * the input it's currently operating on in the native heap.
39  *
40  * Rationale: We choose to make a copy here because it turns out to be a lot
41  * cheaper when a moving GC and/or string compression is enabled. This is
42  * because env->GetStringChars() always copies in this scenario. This becomes
43  * especially bad when the String in question is long and/or contains a large
44  * number of matches.
45  *
46  * Drawbacks: The native allocation associated with this class is no longer
47  * fixed size, so we're effectively lying to the NativeAllocationRegistry about
48  * the size of the object(s) we're allocating on the native heap. The peak
49  * memory usage doesn't change though, given that GetStringChars would have
50  * made an allocation of precisely the same size.
51  */
52 class MatcherState {
53 public:
MatcherState(icu::RegexMatcher * matcher)54     MatcherState(icu::RegexMatcher* matcher) :
55         mMatcher(matcher),
56         mUChars(nullptr),
57         mUText(nullptr),
58         mStatus(U_ZERO_ERROR) {
59     }
60 
updateInput(JNIEnv * env,jstring input)61     bool updateInput(JNIEnv* env, jstring input) {
62         // First, close the UText struct, since we're about to allocate a new one.
63         if (mUText != nullptr) {
64             utext_close(mUText);
65             mUText = nullptr;
66         }
67 
68         // Then delete the UChar* associated with the UText struct..
69         mUChars.reset(nullptr);
70 
71         // TODO: We should investigate whether we can avoid an additional copy
72         // in the native heap when is_copy == JNI_TRUE. The problem with doing
73         // that is that we might call ReleaseStringChars with a different
74         // JNIEnv* on a different downcall. This is currently safe as
75         // implemented in ART, but is unlikely to be portable and the spec stays
76         // silent on the matter.
77         ScopedStringChars inputChars(env, input);
78         if (inputChars.get() == nullptr) {
79             // There will be an exception pending if we get here.
80             return false;
81         }
82 
83         // Make a copy of |input| on the native heap. This copy will be live
84         // until the next call to updateInput or close.
85         mUChars.reset(new (std::nothrow) UChar[inputChars.size()]);
86         if (mUChars.get() == nullptr) {
87             env->ThrowNew(env->FindClass("Ljava/lang/OutOfMemoryError;"), "Out of memory");
88             return false;
89         }
90 
91         static_assert(sizeof(UChar) == sizeof(jchar), "sizeof(Uchar) != sizeof(jchar)");
92         memcpy(mUChars.get(), inputChars.get(), inputChars.size() * sizeof(jchar));
93 
94         // Reset any errors that might have occurred on previous patches.
95         mStatus = U_ZERO_ERROR;
96         mUText = utext_openUChars(nullptr, mUChars.get(), inputChars.size(), &mStatus);
97         if (mUText == nullptr) {
98             CHECK(maybeThrowIcuException(env, "utext_openUChars", mStatus));
99             return false;
100         }
101 
102         // It is an error for ICU to have returned a non-null mUText but to
103         // still have indicated an error.
104         CHECK(U_SUCCESS(mStatus));
105 
106         mMatcher->reset(mUText);
107         return true;
108     }
109 
~MatcherState()110     ~MatcherState() {
111         if (mUText != nullptr) {
112             utext_close(mUText);
113         }
114     }
115 
matcher()116     icu::RegexMatcher* matcher() {
117         return mMatcher.get();
118     }
119 
status()120     UErrorCode& status() {
121         return mStatus;
122     }
123 
updateOffsets(JNIEnv * env,jintArray javaOffsets)124     void updateOffsets(JNIEnv* env, jintArray javaOffsets) {
125         ScopedIntArrayRW offsets(env, javaOffsets);
126         if (offsets.get() == NULL) {
127             return;
128         }
129 
130         for (size_t i = 0, groupCount = mMatcher->groupCount(); i <= groupCount; ++i) {
131             offsets[2*i + 0] = mMatcher->start(i, mStatus);
132             offsets[2*i + 1] = mMatcher->end(i, mStatus);
133         }
134     }
135 
136 private:
137     std::unique_ptr<icu::RegexMatcher> mMatcher;
138     std::unique_ptr<UChar[]> mUChars;
139     UText* mUText;
140     UErrorCode mStatus;
141 
142     // Disallow copy and assignment.
143     MatcherState(const MatcherState&);
144     void operator=(const MatcherState&);
145 };
146 
toMatcherState(jlong address)147 static inline MatcherState* toMatcherState(jlong address) {
148     return reinterpret_cast<MatcherState*>(static_cast<uintptr_t>(address));
149 }
150 
Matcher_free(void * address)151 static void Matcher_free(void* address) {
152     MatcherState* state = reinterpret_cast<MatcherState*>(address);
153     delete state;
154 }
155 
Matcher_getNativeFinalizer(JNIEnv *,jclass)156 static jlong Matcher_getNativeFinalizer(JNIEnv*, jclass) {
157     return reinterpret_cast<jlong>(&Matcher_free);
158 }
159 
Matcher_findImpl(JNIEnv * env,jclass,jlong addr,jint startIndex,jintArray offsets)160 static jboolean Matcher_findImpl(JNIEnv* env, jclass, jlong addr, jint startIndex, jintArray offsets) {
161     MatcherState* state = toMatcherState(addr);
162     UBool result = state->matcher()->find(startIndex, state->status());
163     if (result) {
164         state->updateOffsets(env, offsets);
165         return JNI_TRUE;
166     } else {
167         return JNI_FALSE;
168     }
169 }
170 
Matcher_findNextImpl(JNIEnv * env,jclass,jlong addr,jintArray offsets)171 static jboolean Matcher_findNextImpl(JNIEnv* env, jclass, jlong addr, jintArray offsets) {
172     MatcherState* state = toMatcherState(addr);
173     UBool result = state->matcher()->find();
174     if (result) {
175         state->updateOffsets(env, offsets);
176         return JNI_TRUE;
177     } else {
178         return JNI_FALSE;
179     }
180 }
181 
Matcher_groupCountImpl(JNIEnv *,jclass,jlong addr)182 static jint Matcher_groupCountImpl(JNIEnv*, jclass, jlong addr) {
183     MatcherState* state = toMatcherState(addr);
184     return state->matcher()->groupCount();
185 }
186 
Matcher_hitEndImpl(JNIEnv *,jclass,jlong addr)187 static jboolean Matcher_hitEndImpl(JNIEnv*, jclass, jlong addr) {
188     MatcherState* state = toMatcherState(addr);
189     if (state->matcher()->hitEnd() != 0) {
190         return JNI_TRUE;
191     } else {
192         return JNI_FALSE;
193     }
194 }
195 
Matcher_lookingAtImpl(JNIEnv * env,jclass,jlong addr,jintArray offsets)196 static jboolean Matcher_lookingAtImpl(JNIEnv* env, jclass, jlong addr, jintArray offsets) {
197     MatcherState* state = toMatcherState(addr);
198     UBool result = state->matcher()->lookingAt(state->status());
199     if (result) {
200         state->updateOffsets(env, offsets);
201         return JNI_TRUE;
202     } else {
203         return JNI_FALSE;
204     }
205 }
206 
Matcher_matchesImpl(JNIEnv * env,jclass,jlong addr,jintArray offsets)207 static jboolean Matcher_matchesImpl(JNIEnv* env, jclass, jlong addr, jintArray offsets) {
208     MatcherState* state = toMatcherState(addr);
209     UBool result = state->matcher()->matches(state->status());
210     if (result) {
211         state->updateOffsets(env, offsets);
212         return JNI_TRUE;
213     } else {
214         return JNI_FALSE;
215     }
216 }
217 
Matcher_openImpl(JNIEnv * env,jclass,jlong patternAddr)218 static jlong Matcher_openImpl(JNIEnv* env, jclass, jlong patternAddr) {
219     icu::RegexPattern* pattern = reinterpret_cast<icu::RegexPattern*>(static_cast<uintptr_t>(patternAddr));
220     UErrorCode status = U_ZERO_ERROR;
221     icu::RegexMatcher* result = pattern->matcher(status);
222     if (maybeThrowIcuException(env, "RegexPattern::matcher", status)) {
223         return 0;
224     }
225 
226     return reinterpret_cast<uintptr_t>(new MatcherState(result));
227 }
228 
Matcher_requireEndImpl(JNIEnv *,jclass,jlong addr)229 static jboolean Matcher_requireEndImpl(JNIEnv*, jclass, jlong addr) {
230     MatcherState* state = toMatcherState(addr);
231     if (state->matcher()->requireEnd() != 0) {
232         return JNI_TRUE;
233     } else {
234         return JNI_FALSE;
235     }
236 }
237 
Matcher_setInputImpl(JNIEnv * env,jclass,jlong addr,jstring javaText,jint start,jint end)238 static void Matcher_setInputImpl(JNIEnv* env, jclass, jlong addr, jstring javaText, jint start, jint end) {
239     MatcherState* state = toMatcherState(addr);
240     if (state->updateInput(env, javaText)) {
241         state->matcher()->region(start, end, state->status());
242     }
243 }
244 
Matcher_useAnchoringBoundsImpl(JNIEnv *,jclass,jlong addr,jboolean value)245 static void Matcher_useAnchoringBoundsImpl(JNIEnv*, jclass, jlong addr, jboolean value) {
246     MatcherState* state = toMatcherState(addr);
247     state->matcher()->useAnchoringBounds(value);
248 }
249 
Matcher_useTransparentBoundsImpl(JNIEnv *,jclass,jlong addr,jboolean value)250 static void Matcher_useTransparentBoundsImpl(JNIEnv*, jclass, jlong addr, jboolean value) {
251     MatcherState* state = toMatcherState(addr);
252     state->matcher()->useTransparentBounds(value);
253 }
254 
Matcher_getMatchedGroupIndex0(JNIEnv * env,jclass,jlong patternAddr,jstring javaGroupName)255 static jint Matcher_getMatchedGroupIndex0(JNIEnv* env, jclass, jlong patternAddr, jstring javaGroupName) {
256   icu::RegexPattern* pattern = reinterpret_cast<icu::RegexPattern*>(static_cast<uintptr_t>(patternAddr));
257   ScopedJavaUnicodeString groupName(env, javaGroupName);
258   UErrorCode status = U_ZERO_ERROR;
259 
260   jint result = pattern->groupNumberFromName(groupName.unicodeString(), status);
261   if (U_SUCCESS(status)) {
262     return result;
263   }
264   if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
265     return -1;
266   }
267   maybeThrowIcuException(env, "RegexPattern::groupNumberFromName", status);
268   return -1;
269 }
270 
271 
272 static JNINativeMethod gMethods[] = {
273     NATIVE_METHOD(Matcher, getMatchedGroupIndex0, "(JLjava/lang/String;)I"),
274     NATIVE_METHOD(Matcher, findImpl, "(JI[I)Z"),
275     NATIVE_METHOD(Matcher, findNextImpl, "(J[I)Z"),
276     NATIVE_METHOD(Matcher, getNativeFinalizer, "()J"),
277     NATIVE_METHOD(Matcher, groupCountImpl, "(J)I"),
278     NATIVE_METHOD(Matcher, hitEndImpl, "(J)Z"),
279     NATIVE_METHOD(Matcher, lookingAtImpl, "(J[I)Z"),
280     NATIVE_METHOD(Matcher, matchesImpl, "(J[I)Z"),
281     NATIVE_METHOD(Matcher, openImpl, "(J)J"),
282     NATIVE_METHOD(Matcher, requireEndImpl, "(J)Z"),
283     NATIVE_METHOD(Matcher, setInputImpl, "(JLjava/lang/String;II)V"),
284     NATIVE_METHOD(Matcher, useAnchoringBoundsImpl, "(JZ)V"),
285     NATIVE_METHOD(Matcher, useTransparentBoundsImpl, "(JZ)V"),
286 };
register_java_util_regex_Matcher(JNIEnv * env)287 void register_java_util_regex_Matcher(JNIEnv* env) {
288     jniRegisterNativeMethods(env, "java/util/regex/Matcher", gMethods, NELEM(gMethods));
289 }
290