1 /*
2  *************************************************************************
3  * COPYRIGHT:
4  * Copyright (c) 1996-2012, International Business Machines Corporation and
5  * others. All Rights Reserved.
6  *************************************************************************
7  */
8 
9 #include "unicode/utypes.h"
10 
11 #if !UCONFIG_NO_NORMALIZATION
12 
13 #include "unicode/uniset.h"
14 #include "unicode/unistr.h"
15 #include "unicode/chariter.h"
16 #include "unicode/schriter.h"
17 #include "unicode/uchriter.h"
18 #include "unicode/normlzr.h"
19 #include "unicode/utf16.h"
20 #include "cmemory.h"
21 #include "normalizer2impl.h"
22 #include "uprops.h"  // for uniset_getUnicode32Instance()
23 
24 U_NAMESPACE_BEGIN
25 
// Emits the ICU "poor man's RTTI" boilerplate for Normalizer.
// NOTE(review): per ICU's uobject.h this macro is expected to define
// getStaticClassID()/getDynamicClassID(); confirm against the header in use.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
27 
28 //-------------------------------------------------------------------------
29 // Constructors and other boilerplate
30 //-------------------------------------------------------------------------
31 
32 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
33     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
34     text(new StringCharacterIterator(str)),
35     currentIndex(0), nextIndex(0),
36     buffer(), bufferPos(0)
37 {
38     init();
39 }
40 
Normalizer(const UChar * str,int32_t length,UNormalizationMode mode)41 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
42     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
43     text(new UCharCharacterIterator(str, length)),
44     currentIndex(0), nextIndex(0),
45     buffer(), bufferPos(0)
46 {
47     init();
48 }
49 
Normalizer(const CharacterIterator & iter,UNormalizationMode mode)50 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
51     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
52     text(iter.clone()),
53     currentIndex(0), nextIndex(0),
54     buffer(), bufferPos(0)
55 {
56     init();
57 }
58 
/**
 * Copy constructor.  Copies mode, options, indexes and the output buffer,
 * and clones the source's text iterator so the two objects do not share it.
 * The normalizer pointers are not copied; init() derives fresh ones from the
 * copied mode and options.
 */
Normalizer::Normalizer(const Normalizer &copy)
    : UObject(copy),
      fFilteredNorm2(NULL),
      fNorm2(NULL),
      fUMode(copy.fUMode),
      fOptions(copy.fOptions),
      text(copy.text->clone()),
      currentIndex(copy.currentIndex),
      nextIndex(copy.nextIndex),
      buffer(copy.buffer),
      bufferPos(copy.bufferPos)
{
    init();
}
67 
68 void
init()69 Normalizer::init() {
70     UErrorCode errorCode=U_ZERO_ERROR;
71     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
72     if(fOptions&UNORM_UNICODE_3_2) {
73         delete fFilteredNorm2;
74         fNorm2=fFilteredNorm2=
75             new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
76     }
77     if(U_FAILURE(errorCode)) {
78         errorCode=U_ZERO_ERROR;
79         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
80     }
81 }
82 
~Normalizer()83 Normalizer::~Normalizer()
84 {
85     delete fFilteredNorm2;
86     delete text;
87 }
88 
89 Normalizer*
clone() const90 Normalizer::clone() const
91 {
92     return new Normalizer(*this);
93 }
94 
95 /**
96  * Generates a hash code for this iterator.
97  */
hashCode() const98 int32_t Normalizer::hashCode() const
99 {
100     return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
101 }
102 
operator ==(const Normalizer & that) const103 UBool Normalizer::operator==(const Normalizer& that) const
104 {
105     return
106         this==&that ||
107         (fUMode==that.fUMode &&
108         fOptions==that.fOptions &&
109         *text==*that.text &&
110         buffer==that.buffer &&
111         bufferPos==that.bufferPos &&
112         nextIndex==that.nextIndex);
113 }
114 
115 //-------------------------------------------------------------------------
116 // Static utility methods
117 //-------------------------------------------------------------------------
118 
119 void U_EXPORT2
normalize(const UnicodeString & source,UNormalizationMode mode,int32_t options,UnicodeString & result,UErrorCode & status)120 Normalizer::normalize(const UnicodeString& source,
121                       UNormalizationMode mode, int32_t options,
122                       UnicodeString& result,
123                       UErrorCode &status) {
124     if(source.isBogus() || U_FAILURE(status)) {
125         result.setToBogus();
126         if(U_SUCCESS(status)) {
127             status=U_ILLEGAL_ARGUMENT_ERROR;
128         }
129     } else {
130         UnicodeString localDest;
131         UnicodeString *dest;
132 
133         if(&source!=&result) {
134             dest=&result;
135         } else {
136             // the source and result strings are the same object, use a temporary one
137             dest=&localDest;
138         }
139         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
140         if(U_SUCCESS(status)) {
141             if(options&UNORM_UNICODE_3_2) {
142                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
143                     normalize(source, *dest, status);
144             } else {
145                 n2->normalize(source, *dest, status);
146             }
147         }
148         if(dest==&localDest && U_SUCCESS(status)) {
149             result=*dest;
150         }
151     }
152 }
153 
154 void U_EXPORT2
compose(const UnicodeString & source,UBool compat,int32_t options,UnicodeString & result,UErrorCode & status)155 Normalizer::compose(const UnicodeString& source,
156                     UBool compat, int32_t options,
157                     UnicodeString& result,
158                     UErrorCode &status) {
159     normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
160 }
161 
162 void U_EXPORT2
decompose(const UnicodeString & source,UBool compat,int32_t options,UnicodeString & result,UErrorCode & status)163 Normalizer::decompose(const UnicodeString& source,
164                       UBool compat, int32_t options,
165                       UnicodeString& result,
166                       UErrorCode &status) {
167     normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
168 }
169 
170 UNormalizationCheckResult
quickCheck(const UnicodeString & source,UNormalizationMode mode,int32_t options,UErrorCode & status)171 Normalizer::quickCheck(const UnicodeString& source,
172                        UNormalizationMode mode, int32_t options,
173                        UErrorCode &status) {
174     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
175     if(U_SUCCESS(status)) {
176         if(options&UNORM_UNICODE_3_2) {
177             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
178                 quickCheck(source, status);
179         } else {
180             return n2->quickCheck(source, status);
181         }
182     } else {
183         return UNORM_MAYBE;
184     }
185 }
186 
187 UBool
isNormalized(const UnicodeString & source,UNormalizationMode mode,int32_t options,UErrorCode & status)188 Normalizer::isNormalized(const UnicodeString& source,
189                          UNormalizationMode mode, int32_t options,
190                          UErrorCode &status) {
191     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
192     if(U_SUCCESS(status)) {
193         if(options&UNORM_UNICODE_3_2) {
194             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
195                 isNormalized(source, status);
196         } else {
197             return n2->isNormalized(source, status);
198         }
199     } else {
200         return FALSE;
201     }
202 }
203 
204 UnicodeString & U_EXPORT2
concatenate(const UnicodeString & left,const UnicodeString & right,UnicodeString & result,UNormalizationMode mode,int32_t options,UErrorCode & errorCode)205 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
206                         UnicodeString &result,
207                         UNormalizationMode mode, int32_t options,
208                         UErrorCode &errorCode) {
209     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
210         result.setToBogus();
211         if(U_SUCCESS(errorCode)) {
212             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
213         }
214     } else {
215         UnicodeString localDest;
216         UnicodeString *dest;
217 
218         if(&right!=&result) {
219             dest=&result;
220         } else {
221             // the right and result strings are the same object, use a temporary one
222             dest=&localDest;
223         }
224         *dest=left;
225         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
226         if(U_SUCCESS(errorCode)) {
227             if(options&UNORM_UNICODE_3_2) {
228                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
229                     append(*dest, right, errorCode);
230             } else {
231                 n2->append(*dest, right, errorCode);
232             }
233         }
234         if(dest==&localDest && U_SUCCESS(errorCode)) {
235             result=*dest;
236         }
237     }
238     return result;
239 }
240 
241 //-------------------------------------------------------------------------
242 // Iteration API
243 //-------------------------------------------------------------------------
244 
245 /**
246  * Return the current character in the normalized text.
247  */
current()248 UChar32 Normalizer::current() {
249     if(bufferPos<buffer.length() || nextNormalize()) {
250         return buffer.char32At(bufferPos);
251     } else {
252         return DONE;
253     }
254 }
255 
256 /**
257  * Return the next character in the normalized text and advance
258  * the iteration position by one.  If the end
259  * of the text has already been reached, {@link #DONE} is returned.
260  */
next()261 UChar32 Normalizer::next() {
262     if(bufferPos<buffer.length() ||  nextNormalize()) {
263         UChar32 c=buffer.char32At(bufferPos);
264         bufferPos+=U16_LENGTH(c);
265         return c;
266     } else {
267         return DONE;
268     }
269 }
270 
271 /**
272  * Return the previous character in the normalized text and decrement
273  * the iteration position by one.  If the beginning
274  * of the text has already been reached, {@link #DONE} is returned.
275  */
previous()276 UChar32 Normalizer::previous() {
277     if(bufferPos>0 || previousNormalize()) {
278         UChar32 c=buffer.char32At(bufferPos-1);
279         bufferPos-=U16_LENGTH(c);
280         return c;
281     } else {
282         return DONE;
283     }
284 }
285 
reset()286 void Normalizer::reset() {
287     currentIndex=nextIndex=text->setToStart();
288     clearBuffer();
289 }
290 
291 void
setIndexOnly(int32_t index)292 Normalizer::setIndexOnly(int32_t index) {
293     text->setIndex(index);  // pins index
294     currentIndex=nextIndex=text->getIndex();
295     clearBuffer();
296 }
297 
298 /**
299  * Return the first character in the normalized text.  This resets
300  * the <tt>Normalizer's</tt> position to the beginning of the text.
301  */
first()302 UChar32 Normalizer::first() {
303     reset();
304     return next();
305 }
306 
307 /**
308  * Return the last character in the normalized text.  This resets
309  * the <tt>Normalizer's</tt> position to be just before the
310  * the input text corresponding to that normalized character.
311  */
last()312 UChar32 Normalizer::last() {
313     currentIndex=nextIndex=text->setToEnd();
314     clearBuffer();
315     return previous();
316 }
317 
318 /**
319  * Retrieve the current iteration position in the input text that is
320  * being normalized.  This method is useful in applications such as
321  * searching, where you need to be able to determine the position in
322  * the input text that corresponds to a given normalized output character.
323  * <p>
324  * <b>Note:</b> This method sets the position in the <em>input</em>, while
325  * {@link #next} and {@link #previous} iterate through characters in the
326  * <em>output</em>.  This means that there is not necessarily a one-to-one
327  * correspondence between characters returned by <tt>next</tt> and
328  * <tt>previous</tt> and the indices passed to and returned from
329  * <tt>setIndex</tt> and {@link #getIndex}.
330  *
331  */
getIndex() const332 int32_t Normalizer::getIndex() const {
333     if(bufferPos<buffer.length()) {
334         return currentIndex;
335     } else {
336         return nextIndex;
337     }
338 }
339 
340 /**
341  * Retrieve the index of the start of the input text.  This is the begin index
342  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
343  * over which this <tt>Normalizer</tt> is iterating
344  */
startIndex() const345 int32_t Normalizer::startIndex() const {
346     return text->startIndex();
347 }
348 
349 /**
350  * Retrieve the index of the end of the input text.  This is the end index
351  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
352  * over which this <tt>Normalizer</tt> is iterating
353  */
endIndex() const354 int32_t Normalizer::endIndex() const {
355     return text->endIndex();
356 }
357 
358 //-------------------------------------------------------------------------
359 // Property access methods
360 //-------------------------------------------------------------------------
361 
362 void
setMode(UNormalizationMode newMode)363 Normalizer::setMode(UNormalizationMode newMode)
364 {
365     fUMode = newMode;
366     init();
367 }
368 
369 UNormalizationMode
getUMode() const370 Normalizer::getUMode() const
371 {
372     return fUMode;
373 }
374 
375 void
setOption(int32_t option,UBool value)376 Normalizer::setOption(int32_t option,
377                       UBool value)
378 {
379     if (value) {
380         fOptions |= option;
381     } else {
382         fOptions &= (~option);
383     }
384     init();
385 }
386 
387 UBool
getOption(int32_t option) const388 Normalizer::getOption(int32_t option) const
389 {
390     return (fOptions & option) != 0;
391 }
392 
393 /**
394  * Set the input text over which this <tt>Normalizer</tt> will iterate.
395  * The iteration position is set to the beginning of the input text.
396  */
397 void
setText(const UnicodeString & newText,UErrorCode & status)398 Normalizer::setText(const UnicodeString& newText,
399                     UErrorCode &status)
400 {
401     if (U_FAILURE(status)) {
402         return;
403     }
404     CharacterIterator *newIter = new StringCharacterIterator(newText);
405     if (newIter == NULL) {
406         status = U_MEMORY_ALLOCATION_ERROR;
407         return;
408     }
409     delete text;
410     text = newIter;
411     reset();
412 }
413 
414 /**
415  * Set the input text over which this <tt>Normalizer</tt> will iterate.
416  * The iteration position is set to the beginning of the string.
417  */
418 void
setText(const CharacterIterator & newText,UErrorCode & status)419 Normalizer::setText(const CharacterIterator& newText,
420                     UErrorCode &status)
421 {
422     if (U_FAILURE(status)) {
423         return;
424     }
425     CharacterIterator *newIter = newText.clone();
426     if (newIter == NULL) {
427         status = U_MEMORY_ALLOCATION_ERROR;
428         return;
429     }
430     delete text;
431     text = newIter;
432     reset();
433 }
434 
435 void
setText(const UChar * newText,int32_t length,UErrorCode & status)436 Normalizer::setText(const UChar* newText,
437                     int32_t length,
438                     UErrorCode &status)
439 {
440     if (U_FAILURE(status)) {
441         return;
442     }
443     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
444     if (newIter == NULL) {
445         status = U_MEMORY_ALLOCATION_ERROR;
446         return;
447     }
448     delete text;
449     text = newIter;
450     reset();
451 }
452 
453 /**
454  * Copies the text under iteration into the UnicodeString referred to by "result".
455  * @param result Receives a copy of the text under iteration.
456  */
457 void
getText(UnicodeString & result)458 Normalizer::getText(UnicodeString&  result)
459 {
460     text->getText(result);
461 }
462 
463 //-------------------------------------------------------------------------
464 // Private utility methods
465 //-------------------------------------------------------------------------
466 
clearBuffer()467 void Normalizer::clearBuffer() {
468     buffer.remove();
469     bufferPos=0;
470 }
471 
472 UBool
nextNormalize()473 Normalizer::nextNormalize() {
474     clearBuffer();
475     currentIndex=nextIndex;
476     text->setIndex(nextIndex);
477     if(!text->hasNext()) {
478         return FALSE;
479     }
480     // Skip at least one character so we make progress.
481     UnicodeString segment(text->next32PostInc());
482     while(text->hasNext()) {
483         UChar32 c;
484         if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
485             text->move32(-1, CharacterIterator::kCurrent);
486             break;
487         }
488         segment.append(c);
489     }
490     nextIndex=text->getIndex();
491     UErrorCode errorCode=U_ZERO_ERROR;
492     fNorm2->normalize(segment, buffer, errorCode);
493     return U_SUCCESS(errorCode) && !buffer.isEmpty();
494 }
495 
496 UBool
previousNormalize()497 Normalizer::previousNormalize() {
498     clearBuffer();
499     nextIndex=currentIndex;
500     text->setIndex(currentIndex);
501     if(!text->hasPrevious()) {
502         return FALSE;
503     }
504     UnicodeString segment;
505     while(text->hasPrevious()) {
506         UChar32 c=text->previous32();
507         segment.insert(0, c);
508         if(fNorm2->hasBoundaryBefore(c)) {
509             break;
510         }
511     }
512     currentIndex=text->getIndex();
513     UErrorCode errorCode=U_ZERO_ERROR;
514     fNorm2->normalize(segment, buffer, errorCode);
515     bufferPos=buffer.length();
516     return U_SUCCESS(errorCode) && !buffer.isEmpty();
517 }
518 
519 U_NAMESPACE_END
520 
521 #endif /* #if !UCONFIG_NO_NORMALIZATION */
522