1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2009-2014, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  normalizer2impl.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2009nov22
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_NORMALIZATION
20 
21 #include "unicode/normalizer2.h"
22 #include "unicode/udata.h"
23 #include "unicode/ustring.h"
24 #include "unicode/utf16.h"
25 #include "cmemory.h"
26 #include "mutex.h"
27 #include "normalizer2impl.h"
28 #include "putilimp.h"
29 #include "uassert.h"
30 #include "uset_imp.h"
31 #include "utrie2.h"
32 #include "uvector.h"
33 
34 U_NAMESPACE_BEGIN
35 
36 // ReorderingBuffer -------------------------------------------------------- ***
37 
init(int32_t destCapacity,UErrorCode & errorCode)38 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
39     int32_t length=str.length();
40     start=str.getBuffer(destCapacity);
41     if(start==NULL) {
42         // getBuffer() already did str.setToBogus()
43         errorCode=U_MEMORY_ALLOCATION_ERROR;
44         return FALSE;
45     }
46     limit=start+length;
47     remainingCapacity=str.getCapacity()-length;
48     reorderStart=start;
49     if(start==limit) {
50         lastCC=0;
51     } else {
52         setIterator();
53         lastCC=previousCC();
54         // Set reorderStart after the last code point with cc<=1 if there is one.
55         if(lastCC>1) {
56             while(previousCC()>1) {}
57         }
58         reorderStart=codePointLimit;
59     }
60     return TRUE;
61 }
62 
equals(const UChar * otherStart,const UChar * otherLimit) const63 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
64     int32_t length=(int32_t)(limit-start);
65     return
66         length==(int32_t)(otherLimit-otherStart) &&
67         0==u_memcmp(start, otherStart, length);
68 }
69 
appendSupplementary(UChar32 c,uint8_t cc,UErrorCode & errorCode)70 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
71     if(remainingCapacity<2 && !resize(2, errorCode)) {
72         return FALSE;
73     }
74     if(lastCC<=cc || cc==0) {
75         limit[0]=U16_LEAD(c);
76         limit[1]=U16_TRAIL(c);
77         limit+=2;
78         lastCC=cc;
79         if(cc<=1) {
80             reorderStart=limit;
81         }
82     } else {
83         insert(c, cc);
84     }
85     remainingCapacity-=2;
86     return TRUE;
87 }
88 
append(const UChar * s,int32_t length,uint8_t leadCC,uint8_t trailCC,UErrorCode & errorCode)89 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
90                                uint8_t leadCC, uint8_t trailCC,
91                                UErrorCode &errorCode) {
92     if(length==0) {
93         return TRUE;
94     }
95     if(remainingCapacity<length && !resize(length, errorCode)) {
96         return FALSE;
97     }
98     remainingCapacity-=length;
99     if(lastCC<=leadCC || leadCC==0) {
100         if(trailCC<=1) {
101             reorderStart=limit+length;
102         } else if(leadCC<=1) {
103             reorderStart=limit+1;  // Ok if not a code point boundary.
104         }
105         const UChar *sLimit=s+length;
106         do { *limit++=*s++; } while(s!=sLimit);
107         lastCC=trailCC;
108     } else {
109         int32_t i=0;
110         UChar32 c;
111         U16_NEXT(s, i, length, c);
112         insert(c, leadCC);  // insert first code point
113         while(i<length) {
114             U16_NEXT(s, i, length, c);
115             if(i<length) {
116                 // s must be in NFD, otherwise we need to use getCC().
117                 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
118             } else {
119                 leadCC=trailCC;
120             }
121             append(c, leadCC, errorCode);
122         }
123     }
124     return TRUE;
125 }
126 
appendZeroCC(UChar32 c,UErrorCode & errorCode)127 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
128     int32_t cpLength=U16_LENGTH(c);
129     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
130         return FALSE;
131     }
132     remainingCapacity-=cpLength;
133     if(cpLength==1) {
134         *limit++=(UChar)c;
135     } else {
136         limit[0]=U16_LEAD(c);
137         limit[1]=U16_TRAIL(c);
138         limit+=2;
139     }
140     lastCC=0;
141     reorderStart=limit;
142     return TRUE;
143 }
144 
appendZeroCC(const UChar * s,const UChar * sLimit,UErrorCode & errorCode)145 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
146     if(s==sLimit) {
147         return TRUE;
148     }
149     int32_t length=(int32_t)(sLimit-s);
150     if(remainingCapacity<length && !resize(length, errorCode)) {
151         return FALSE;
152     }
153     u_memcpy(limit, s, length);
154     limit+=length;
155     remainingCapacity-=length;
156     lastCC=0;
157     reorderStart=limit;
158     return TRUE;
159 }
160 
remove()161 void ReorderingBuffer::remove() {
162     reorderStart=limit=start;
163     remainingCapacity=str.getCapacity();
164     lastCC=0;
165 }
166 
removeSuffix(int32_t suffixLength)167 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
168     if(suffixLength<(limit-start)) {
169         limit-=suffixLength;
170         remainingCapacity+=suffixLength;
171     } else {
172         limit=start;
173         remainingCapacity=str.getCapacity();
174     }
175     lastCC=0;
176     reorderStart=limit;
177 }
178 
resize(int32_t appendLength,UErrorCode & errorCode)179 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
180     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
181     int32_t length=(int32_t)(limit-start);
182     str.releaseBuffer(length);
183     int32_t newCapacity=length+appendLength;
184     int32_t doubleCapacity=2*str.getCapacity();
185     if(newCapacity<doubleCapacity) {
186         newCapacity=doubleCapacity;
187     }
188     if(newCapacity<256) {
189         newCapacity=256;
190     }
191     start=str.getBuffer(newCapacity);
192     if(start==NULL) {
193         // getBuffer() already did str.setToBogus()
194         errorCode=U_MEMORY_ALLOCATION_ERROR;
195         return FALSE;
196     }
197     reorderStart=start+reorderStartIndex;
198     limit=start+length;
199     remainingCapacity=str.getCapacity()-length;
200     return TRUE;
201 }
202 
skipPrevious()203 void ReorderingBuffer::skipPrevious() {
204     codePointLimit=codePointStart;
205     UChar c=*--codePointStart;
206     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
207         --codePointStart;
208     }
209 }
210 
previousCC()211 uint8_t ReorderingBuffer::previousCC() {
212     codePointLimit=codePointStart;
213     if(reorderStart>=codePointStart) {
214         return 0;
215     }
216     UChar32 c=*--codePointStart;
217     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
218         return 0;
219     }
220 
221     UChar c2;
222     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
223         --codePointStart;
224         c=U16_GET_SUPPLEMENTARY(c2, c);
225     }
226     return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
227 }
228 
229 // Inserts c somewhere before the last character.
230 // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(UChar32 c,uint8_t cc)231 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
232     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
233     // insert c at codePointLimit, after the character with prevCC<=cc
234     UChar *q=limit;
235     UChar *r=limit+=U16_LENGTH(c);
236     do {
237         *--r=*--q;
238     } while(codePointLimit!=q);
239     writeCodePoint(q, c);
240     if(cc<=1) {
241         reorderStart=r;
242     }
243 }
244 
245 // Normalizer2Impl --------------------------------------------------------- ***
246 
247 struct CanonIterData : public UMemory {
248     CanonIterData(UErrorCode &errorCode);
249     ~CanonIterData();
250     void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
251     UTrie2 *trie;
252     UVector canonStartSets;  // contains UnicodeSet *
253 };
254 
~Normalizer2Impl()255 Normalizer2Impl::~Normalizer2Impl() {
256     delete fCanonIterData;
257 }
258 
259 void
init(const int32_t * inIndexes,const UTrie2 * inTrie,const uint16_t * inExtraData,const uint8_t * inSmallFCD)260 Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie,
261                       const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
262     minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
263     minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
264 
265     minYesNo=inIndexes[IX_MIN_YES_NO];
266     minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
267     minNoNo=inIndexes[IX_MIN_NO_NO];
268     limitNoNo=inIndexes[IX_LIMIT_NO_NO];
269     minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
270 
271     normTrie=inTrie;
272 
273     maybeYesCompositions=inExtraData;
274     extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
275 
276     smallFCD=inSmallFCD;
277 
278     // Build tccc180[].
279     // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
280     uint8_t bits=0;
281     for(UChar c=0; c<0x180; bits>>=1) {
282         if((c&0xff)==0) {
283             bits=smallFCD[c>>8];  // one byte per 0x100 code points
284         }
285         if(bits&1) {
286             for(int i=0; i<0x20; ++i, ++c) {
287                 tccc180[c]=(uint8_t)getFCD16FromNormData(c);
288             }
289         } else {
290             uprv_memset(tccc180+c, 0, 0x20);
291             c+=0x20;
292         }
293     }
294 }
295 
getTrailCCFromCompYesAndZeroCC(const UChar * cpStart,const UChar * cpLimit) const296 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
297     UChar32 c;
298     if(cpStart==(cpLimit-1)) {
299         c=*cpStart;
300     } else {
301         c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
302     }
303     uint16_t prevNorm16=getNorm16(c);
304     if(prevNorm16<=minYesNo) {
305         return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
306     } else {
307         return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo
308     }
309 }
310 
311 namespace {
312 
313 class LcccContext {
314 public:
LcccContext(const Normalizer2Impl & ni,UnicodeSet & s)315     LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {}
316 
handleRange(UChar32 start,UChar32 end,uint16_t norm16)317     void handleRange(UChar32 start, UChar32 end, uint16_t norm16) {
318         if(impl.isAlgorithmicNoNo(norm16)) {
319             // Range of code points with same-norm16-value algorithmic decompositions.
320             // They might have different non-zero FCD16 values.
321             do {
322                 uint16_t fcd16=impl.getFCD16(start);
323                 if(fcd16>0xff) { set.add(start); }
324             } while(++start<=end);
325         } else {
326             uint16_t fcd16=impl.getFCD16(start);
327             if(fcd16>0xff) { set.add(start, end); }
328         }
329     }
330 
331 private:
332     const Normalizer2Impl &impl;
333     UnicodeSet &set;
334 };
335 
336 struct PropertyStartsContext {
PropertyStartsContext__anon7311a2d80111::PropertyStartsContext337     PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder)
338             : impl(ni), sa(adder) {}
339 
340     const Normalizer2Impl &impl;
341     const USetAdder *sa;
342 };
343 
344 }  // namespace
345 
346 U_CDECL_BEGIN
347 
348 static UBool U_CALLCONV
enumLcccRange(const void * context,UChar32 start,UChar32 end,uint32_t value)349 enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
350     ((LcccContext *)context)->handleRange(start, end, (uint16_t)value);
351     return TRUE;
352 }
353 
354 static UBool U_CALLCONV
enumNorm16PropertyStartsRange(const void * context,UChar32 start,UChar32 end,uint32_t value)355 enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
356     /* add the start code point to the USet */
357     const PropertyStartsContext *ctx=(const PropertyStartsContext *)context;
358     const USetAdder *sa=ctx->sa;
359     sa->add(sa->set, start);
360     if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) {
361         // Range of code points with same-norm16-value algorithmic decompositions.
362         // They might have different non-zero FCD16 values.
363         uint16_t prevFCD16=ctx->impl.getFCD16(start);
364         while(++start<=end) {
365             uint16_t fcd16=ctx->impl.getFCD16(start);
366             if(fcd16!=prevFCD16) {
367                 sa->add(sa->set, start);
368                 prevFCD16=fcd16;
369             }
370         }
371     }
372     return TRUE;
373 }
374 
375 static UBool U_CALLCONV
enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)376 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
377     /* add the start code point to the USet */
378     const USetAdder *sa=(const USetAdder *)context;
379     sa->add(sa->set, start);
380     return TRUE;
381 }
382 
383 static uint32_t U_CALLCONV
segmentStarterMapper(const void *,uint32_t value)384 segmentStarterMapper(const void * /*context*/, uint32_t value) {
385     return value&CANON_NOT_SEGMENT_STARTER;
386 }
387 
388 U_CDECL_END
389 
390 void
addLcccChars(UnicodeSet & set) const391 Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
392     /* add the start code point of each same-value range of each trie */
393     LcccContext context(*this, set);
394     utrie2_enum(normTrie, NULL, enumLcccRange, &context);
395 }
396 
397 void
addPropertyStarts(const USetAdder * sa,UErrorCode &) const398 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
399     /* add the start code point of each same-value range of each trie */
400     PropertyStartsContext context(*this, sa);
401     utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context);
402 
403     /* add Hangul LV syllables and LV+1 because of skippables */
404     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
405         sa->add(sa->set, c);
406         sa->add(sa->set, c+1);
407     }
408     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
409 }
410 
411 void
addCanonIterPropertyStarts(const USetAdder * sa,UErrorCode & errorCode) const412 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
413     /* add the start code point of each same-value range of the canonical iterator data trie */
414     if(ensureCanonIterData(errorCode)) {
415         // currently only used for the SEGMENT_STARTER property
416         utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
417     }
418 }
419 
420 const UChar *
copyLowPrefixFromNulTerminated(const UChar * src,UChar32 minNeedDataCP,ReorderingBuffer * buffer,UErrorCode & errorCode) const421 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
422                                                 UChar32 minNeedDataCP,
423                                                 ReorderingBuffer *buffer,
424                                                 UErrorCode &errorCode) const {
425     // Make some effort to support NUL-terminated strings reasonably.
426     // Take the part of the fast quick check loop that does not look up
427     // data and check the first part of the string.
428     // After this prefix, determine the string length to simplify the rest
429     // of the code.
430     const UChar *prevSrc=src;
431     UChar c;
432     while((c=*src++)<minNeedDataCP && c!=0) {}
433     // Back out the last character for full processing.
434     // Copy this prefix.
435     if(--src!=prevSrc) {
436         if(buffer!=NULL) {
437             buffer->appendZeroCC(prevSrc, src, errorCode);
438         }
439     }
440     return src;
441 }
442 
443 UnicodeString &
decompose(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const444 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
445                            UErrorCode &errorCode) const {
446     if(U_FAILURE(errorCode)) {
447         dest.setToBogus();
448         return dest;
449     }
450     const UChar *sArray=src.getBuffer();
451     if(&dest==&src || sArray==NULL) {
452         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
453         dest.setToBogus();
454         return dest;
455     }
456     decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
457     return dest;
458 }
459 
460 void
decompose(const UChar * src,const UChar * limit,UnicodeString & dest,int32_t destLengthEstimate,UErrorCode & errorCode) const461 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
462                            UnicodeString &dest,
463                            int32_t destLengthEstimate,
464                            UErrorCode &errorCode) const {
465     if(destLengthEstimate<0 && limit!=NULL) {
466         destLengthEstimate=(int32_t)(limit-src);
467     }
468     dest.remove();
469     ReorderingBuffer buffer(*this, dest);
470     if(buffer.init(destLengthEstimate, errorCode)) {
471         decompose(src, limit, &buffer, errorCode);
472     }
473 }
474 
475 // Dual functionality:
476 // buffer!=NULL: normalize
477 // buffer==NULL: isNormalized/spanQuickCheckYes
478 const UChar *
decompose(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const479 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
480                            ReorderingBuffer *buffer,
481                            UErrorCode &errorCode) const {
482     UChar32 minNoCP=minDecompNoCP;
483     if(limit==NULL) {
484         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
485         if(U_FAILURE(errorCode)) {
486             return src;
487         }
488         limit=u_strchr(src, 0);
489     }
490 
491     const UChar *prevSrc;
492     UChar32 c=0;
493     uint16_t norm16=0;
494 
495     // only for quick check
496     const UChar *prevBoundary=src;
497     uint8_t prevCC=0;
498 
499     for(;;) {
500         // count code units below the minimum or with irrelevant data for the quick check
501         for(prevSrc=src; src!=limit;) {
502             if( (c=*src)<minNoCP ||
503                 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
504             ) {
505                 ++src;
506             } else if(!U16_IS_SURROGATE(c)) {
507                 break;
508             } else {
509                 UChar c2;
510                 if(U16_IS_SURROGATE_LEAD(c)) {
511                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
512                         c=U16_GET_SUPPLEMENTARY(c, c2);
513                     }
514                 } else /* trail surrogate */ {
515                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
516                         --src;
517                         c=U16_GET_SUPPLEMENTARY(c2, c);
518                     }
519                 }
520                 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
521                     src+=U16_LENGTH(c);
522                 } else {
523                     break;
524                 }
525             }
526         }
527         // copy these code units all at once
528         if(src!=prevSrc) {
529             if(buffer!=NULL) {
530                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
531                     break;
532                 }
533             } else {
534                 prevCC=0;
535                 prevBoundary=src;
536             }
537         }
538         if(src==limit) {
539             break;
540         }
541 
542         // Check one above-minimum, relevant code point.
543         src+=U16_LENGTH(c);
544         if(buffer!=NULL) {
545             if(!decompose(c, norm16, *buffer, errorCode)) {
546                 break;
547             }
548         } else {
549             if(isDecompYes(norm16)) {
550                 uint8_t cc=getCCFromYesOrMaybe(norm16);
551                 if(prevCC<=cc || cc==0) {
552                     prevCC=cc;
553                     if(cc<=1) {
554                         prevBoundary=src;
555                     }
556                     continue;
557                 }
558             }
559             return prevBoundary;  // "no" or cc out of order
560         }
561     }
562     return src;
563 }
564 
565 // Decompose a short piece of text which is likely to contain characters that
566 // fail the quick check loop and/or where the quick check loop's overhead
567 // is unlikely to be amortized.
568 // Called by the compose() and makeFCD() implementations.
decomposeShort(const UChar * src,const UChar * limit,ReorderingBuffer & buffer,UErrorCode & errorCode) const569 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
570                                       ReorderingBuffer &buffer,
571                                       UErrorCode &errorCode) const {
572     while(src<limit) {
573         UChar32 c;
574         uint16_t norm16;
575         UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
576         if(!decompose(c, norm16, buffer, errorCode)) {
577             return FALSE;
578         }
579     }
580     return TRUE;
581 }
582 
decompose(UChar32 c,uint16_t norm16,ReorderingBuffer & buffer,UErrorCode & errorCode) const583 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
584                                  ReorderingBuffer &buffer,
585                                  UErrorCode &errorCode) const {
586     // Only loops for 1:1 algorithmic mappings.
587     for(;;) {
588         // get the decomposition and the lead and trail cc's
589         if(isDecompYes(norm16)) {
590             // c does not decompose
591             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
592         } else if(isHangul(norm16)) {
593             // Hangul syllable: decompose algorithmically
594             UChar jamos[3];
595             return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
596         } else if(isDecompNoAlgorithmic(norm16)) {
597             c=mapAlgorithmic(c, norm16);
598             norm16=getNorm16(c);
599         } else {
600             // c decomposes, get everything from the variable-length extra data
601             const uint16_t *mapping=getMapping(norm16);
602             uint16_t firstUnit=*mapping;
603             int32_t length=firstUnit&MAPPING_LENGTH_MASK;
604             uint8_t leadCC, trailCC;
605             trailCC=(uint8_t)(firstUnit>>8);
606             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
607                 leadCC=(uint8_t)(*(mapping-1)>>8);
608             } else {
609                 leadCC=0;
610             }
611             return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
612         }
613     }
614 }
615 
616 const UChar *
getDecomposition(UChar32 c,UChar buffer[4],int32_t & length) const617 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
618     const UChar *decomp=NULL;
619     uint16_t norm16;
620     for(;;) {
621         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
622             // c does not decompose
623             return decomp;
624         } else if(isHangul(norm16)) {
625             // Hangul syllable: decompose algorithmically
626             length=Hangul::decompose(c, buffer);
627             return buffer;
628         } else if(isDecompNoAlgorithmic(norm16)) {
629             c=mapAlgorithmic(c, norm16);
630             decomp=buffer;
631             length=0;
632             U16_APPEND_UNSAFE(buffer, length, c);
633         } else {
634             // c decomposes, get everything from the variable-length extra data
635             const uint16_t *mapping=getMapping(norm16);
636             length=*mapping&MAPPING_LENGTH_MASK;
637             return (const UChar *)mapping+1;
638         }
639     }
640 }
641 
642 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
643 // so that a raw mapping fits that consists of one unit ("rm0")
644 // plus all but the first two code units of the normal mapping.
645 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
646 const UChar *
getRawDecomposition(UChar32 c,UChar buffer[30],int32_t & length) const647 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
648     // We do not loop in this method because an algorithmic mapping itself
649     // becomes a final result rather than having to be decomposed recursively.
650     uint16_t norm16;
651     if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
652         // c does not decompose
653         return NULL;
654     } else if(isHangul(norm16)) {
655         // Hangul syllable: decompose algorithmically
656         Hangul::getRawDecomposition(c, buffer);
657         length=2;
658         return buffer;
659     } else if(isDecompNoAlgorithmic(norm16)) {
660         c=mapAlgorithmic(c, norm16);
661         length=0;
662         U16_APPEND_UNSAFE(buffer, length, c);
663         return buffer;
664     } else {
665         // c decomposes, get everything from the variable-length extra data
666         const uint16_t *mapping=getMapping(norm16);
667         uint16_t firstUnit=*mapping;
668         int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
669         if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
670             // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
671             // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
672             const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
673             uint16_t rm0=*rawMapping;
674             if(rm0<=MAPPING_LENGTH_MASK) {
675                 length=rm0;
676                 return (const UChar *)rawMapping-rm0;
677             } else {
678                 // Copy the normal mapping and replace its first two code units with rm0.
679                 buffer[0]=(UChar)rm0;
680                 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
681                 length=mLength-1;
682                 return buffer;
683             }
684         } else {
685             length=mLength;
686             return (const UChar *)mapping+1;
687         }
688     }
689 }
690 
decomposeAndAppend(const UChar * src,const UChar * limit,UBool doDecompose,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const691 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
692                                          UBool doDecompose,
693                                          UnicodeString &safeMiddle,
694                                          ReorderingBuffer &buffer,
695                                          UErrorCode &errorCode) const {
696     buffer.copyReorderableSuffixTo(safeMiddle);
697     if(doDecompose) {
698         decompose(src, limit, &buffer, errorCode);
699         return;
700     }
701     // Just merge the strings at the boundary.
702     ForwardUTrie2StringIterator iter(normTrie, src, limit);
703     uint8_t firstCC, prevCC, cc;
704     firstCC=prevCC=cc=getCC(iter.next16());
705     while(cc!=0) {
706         prevCC=cc;
707         cc=getCC(iter.next16());
708     };
709     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
710         limit=u_strchr(iter.codePointStart, 0);
711     }
712 
713     if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
714         buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
715     }
716 }
717 
718 // Note: hasDecompBoundary() could be implemented as aliases to
719 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
720 // at the cost of building the FCD trie for a decomposition normalizer.
hasDecompBoundary(UChar32 c,UBool before) const721 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
722     for(;;) {
723         if(c<minDecompNoCP) {
724             return TRUE;
725         }
726         uint16_t norm16=getNorm16(c);
727         if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
728             return TRUE;
729         } else if(norm16>MIN_NORMAL_MAYBE_YES) {
730             return FALSE;  // ccc!=0
731         } else if(isDecompNoAlgorithmic(norm16)) {
732             c=mapAlgorithmic(c, norm16);
733         } else {
734             // c decomposes, get everything from the variable-length extra data
735             const uint16_t *mapping=getMapping(norm16);
736             uint16_t firstUnit=*mapping;
737             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
738                 return FALSE;
739             }
740             if(!before) {
741                 // decomp after-boundary: same as hasFCDBoundaryAfter(),
742                 // fcd16<=1 || trailCC==0
743                 if(firstUnit>0x1ff) {
744                     return FALSE;  // trailCC>1
745                 }
746                 if(firstUnit<=0xff) {
747                     return TRUE;  // trailCC==0
748                 }
749                 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
750             }
751             // TRUE if leadCC==0 (hasFCDBoundaryBefore())
752             return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
753         }
754     }
755 }
756 
757 /*
758  * Finds the recomposition result for
759  * a forward-combining "lead" character,
760  * specified with a pointer to its compositions list,
761  * and a backward-combining "trail" character.
762  *
763  * If the lead and trail characters combine, then this function returns
764  * the following "compositeAndFwd" value:
765  * Bits 21..1  composite character
766  * Bit      0  set if the composite is a forward-combining starter
767  * otherwise it returns -1.
768  *
769  * The compositions list has (trail, compositeAndFwd) pair entries,
770  * encoded as either pairs or triples of 16-bit units.
771  * The last entry has the high bit of its first unit set.
772  *
773  * The list is sorted by ascending trail characters (there are no duplicates).
774  * A linear search is used.
775  *
776  * See normalizer2impl.h for a more detailed description
777  * of the compositions list format.
778  */
combine(const uint16_t * list,UChar32 trail)779 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
780     uint16_t key1, firstUnit;
781     if(trail<COMP_1_TRAIL_LIMIT) {
782         // trail character is 0..33FF
783         // result entry may have 2 or 3 units
784         key1=(uint16_t)(trail<<1);
785         while(key1>(firstUnit=*list)) {
786             list+=2+(firstUnit&COMP_1_TRIPLE);
787         }
788         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
789             if(firstUnit&COMP_1_TRIPLE) {
790                 return ((int32_t)list[1]<<16)|list[2];
791             } else {
792                 return list[1];
793             }
794         }
795     } else {
796         // trail character is 3400..10FFFF
797         // result entry has 3 units
798         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
799                         (((trail>>COMP_1_TRAIL_SHIFT))&
800                           ~COMP_1_TRIPLE));
801         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
802         uint16_t secondUnit;
803         for(;;) {
804             if(key1>(firstUnit=*list)) {
805                 list+=2+(firstUnit&COMP_1_TRIPLE);
806             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
807                 if(key2>(secondUnit=list[1])) {
808                     if(firstUnit&COMP_1_LAST_TUPLE) {
809                         break;
810                     } else {
811                         list+=3;
812                     }
813                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
814                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
815                 } else {
816                     break;
817                 }
818             } else {
819                 break;
820             }
821         }
822     }
823     return -1;
824 }
825 
826 /**
827   * @param list some character's compositions list
828   * @param set recursively receives the composites from these compositions
829   */
addComposites(const uint16_t * list,UnicodeSet & set) const830 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
831     uint16_t firstUnit;
832     int32_t compositeAndFwd;
833     do {
834         firstUnit=*list;
835         if((firstUnit&COMP_1_TRIPLE)==0) {
836             compositeAndFwd=list[1];
837             list+=2;
838         } else {
839             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
840             list+=3;
841         }
842         UChar32 composite=compositeAndFwd>>1;
843         if((compositeAndFwd&1)!=0) {
844             addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
845         }
846         set.add(composite);
847     } while((firstUnit&COMP_1_LAST_TUPLE)==0);
848 }
849 
850 /*
851  * Recomposes the buffer text starting at recomposeStartIndex
852  * (which is in NFD - decomposed and canonically ordered),
853  * and truncates the buffer contents.
854  *
855  * Note that recomposition never lengthens the text:
856  * Any character consists of either one or two code units;
857  * a composition may contain at most one more code unit than the original starter,
858  * while the combining mark that is removed has at least one code unit.
859  */
recompose(ReorderingBuffer & buffer,int32_t recomposeStartIndex,UBool onlyContiguous) const860 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
861                                 UBool onlyContiguous) const {
862     UChar *p=buffer.getStart()+recomposeStartIndex;
863     UChar *limit=buffer.getLimit();
864     if(p==limit) {
865         return;
866     }
867 
868     UChar *starter, *pRemove, *q, *r;
869     const uint16_t *compositionsList;
870     UChar32 c, compositeAndFwd;
871     uint16_t norm16;
872     uint8_t cc, prevCC;
873     UBool starterIsSupplementary;
874 
875     // Some of the following variables are not used until we have a forward-combining starter
876     // and are only initialized now to avoid compiler warnings.
877     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
878     starter=NULL;
879     starterIsSupplementary=FALSE;
880     prevCC=0;
881 
882     for(;;) {
883         UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
884         cc=getCCFromYesOrMaybe(norm16);
885         if( // this character combines backward and
886             isMaybe(norm16) &&
887             // we have seen a starter that combines forward and
888             compositionsList!=NULL &&
889             // the backward-combining character is not blocked
890             (prevCC<cc || prevCC==0)
891         ) {
892             if(isJamoVT(norm16)) {
893                 // c is a Jamo V/T, see if we can compose it with the previous character.
894                 if(c<Hangul::JAMO_T_BASE) {
895                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
896                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
897                     if(prev<Hangul::JAMO_L_COUNT) {
898                         pRemove=p-1;
899                         UChar syllable=(UChar)
900                             (Hangul::HANGUL_BASE+
901                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
902                              Hangul::JAMO_T_COUNT);
903                         UChar t;
904                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
905                             ++p;
906                             syllable+=t;  // The next character was a Jamo T.
907                         }
908                         *starter=syllable;
909                         // remove the Jamo V/T
910                         q=pRemove;
911                         r=p;
912                         while(r<limit) {
913                             *q++=*r++;
914                         }
915                         limit=q;
916                         p=pRemove;
917                     }
918                 }
919                 /*
920                  * No "else" for Jamo T:
921                  * Since the input is in NFD, there are no Hangul LV syllables that
922                  * a Jamo T could combine with.
923                  * All Jamo Ts are combined above when handling Jamo Vs.
924                  */
925                 if(p==limit) {
926                     break;
927                 }
928                 compositionsList=NULL;
929                 continue;
930             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
931                 // The starter and the combining mark (c) do combine.
932                 UChar32 composite=compositeAndFwd>>1;
933 
934                 // Replace the starter with the composite, remove the combining mark.
935                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
936                 if(starterIsSupplementary) {
937                     if(U_IS_SUPPLEMENTARY(composite)) {
938                         // both are supplementary
939                         starter[0]=U16_LEAD(composite);
940                         starter[1]=U16_TRAIL(composite);
941                     } else {
942                         *starter=(UChar)composite;
943                         // The composite is shorter than the starter,
944                         // move the intermediate characters forward one.
945                         starterIsSupplementary=FALSE;
946                         q=starter+1;
947                         r=q+1;
948                         while(r<pRemove) {
949                             *q++=*r++;
950                         }
951                         --pRemove;
952                     }
953                 } else if(U_IS_SUPPLEMENTARY(composite)) {
954                     // The composite is longer than the starter,
955                     // move the intermediate characters back one.
956                     starterIsSupplementary=TRUE;
957                     ++starter;  // temporarily increment for the loop boundary
958                     q=pRemove;
959                     r=++pRemove;
960                     while(starter<q) {
961                         *--r=*--q;
962                     }
963                     *starter=U16_TRAIL(composite);
964                     *--starter=U16_LEAD(composite);  // undo the temporary increment
965                 } else {
966                     // both are on the BMP
967                     *starter=(UChar)composite;
968                 }
969 
970                 /* remove the combining mark by moving the following text over it */
971                 if(pRemove<p) {
972                     q=pRemove;
973                     r=p;
974                     while(r<limit) {
975                         *q++=*r++;
976                     }
977                     limit=q;
978                     p=pRemove;
979                 }
980                 // Keep prevCC because we removed the combining mark.
981 
982                 if(p==limit) {
983                     break;
984                 }
985                 // Is the composite a starter that combines forward?
986                 if(compositeAndFwd&1) {
987                     compositionsList=
988                         getCompositionsListForComposite(getNorm16(composite));
989                 } else {
990                     compositionsList=NULL;
991                 }
992 
993                 // We combined; continue with looking for compositions.
994                 continue;
995             }
996         }
997 
998         // no combination this time
999         prevCC=cc;
1000         if(p==limit) {
1001             break;
1002         }
1003 
1004         // If c did not combine, then check if it is a starter.
1005         if(cc==0) {
1006             // Found a new starter.
1007             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
1008                 // It may combine with something, prepare for it.
1009                 if(U_IS_BMP(c)) {
1010                     starterIsSupplementary=FALSE;
1011                     starter=p-1;
1012                 } else {
1013                     starterIsSupplementary=TRUE;
1014                     starter=p-2;
1015                 }
1016             }
1017         } else if(onlyContiguous) {
1018             // FCC: no discontiguous compositions; any intervening character blocks.
1019             compositionsList=NULL;
1020         }
1021     }
1022     buffer.setReorderingLimit(limit);
1023 }
1024 
1025 UChar32
composePair(UChar32 a,UChar32 b) const1026 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
1027     uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
1028     const uint16_t *list;
1029     if(isInert(norm16)) {
1030         return U_SENTINEL;
1031     } else if(norm16<minYesNoMappingsOnly) {
1032         if(isJamoL(norm16)) {
1033             b-=Hangul::JAMO_V_BASE;
1034             if(0<=b && b<Hangul::JAMO_V_COUNT) {
1035                 return
1036                     (Hangul::HANGUL_BASE+
1037                      ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
1038                      Hangul::JAMO_T_COUNT);
1039             } else {
1040                 return U_SENTINEL;
1041             }
1042         } else if(isHangul(norm16)) {
1043             b-=Hangul::JAMO_T_BASE;
1044             if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
1045                 return a+b;
1046             } else {
1047                 return U_SENTINEL;
1048             }
1049         } else {
1050             // 'a' has a compositions list in extraData
1051             list=extraData+norm16;
1052             if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
1053                 list+=  // mapping pointer
1054                     1+  // +1 to skip the first unit with the mapping lenth
1055                     (*list&MAPPING_LENGTH_MASK);  // + mapping length
1056             }
1057         }
1058     } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1059         return U_SENTINEL;
1060     } else {
1061         list=maybeYesCompositions+norm16-minMaybeYes;
1062     }
1063     if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
1064         return U_SENTINEL;
1065     }
1066 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1067     return combine(list, b)>>1;
1068 #else
1069     int32_t compositeAndFwd=combine(list, b);
1070     return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1071 #endif
1072 }
1073 
1074 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1075 // doCompose: normalize
1076 // !doCompose: isNormalized (buffer must be empty and initialized)
1077 UBool
compose(const UChar * src,const UChar * limit,UBool onlyContiguous,UBool doCompose,ReorderingBuffer & buffer,UErrorCode & errorCode) const1078 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
1079                          UBool onlyContiguous,
1080                          UBool doCompose,
1081                          ReorderingBuffer &buffer,
1082                          UErrorCode &errorCode) const {
1083     /*
1084      * prevBoundary points to the last character before the current one
1085      * that has a composition boundary before it with ccc==0 and quick check "yes".
1086      * Keeping track of prevBoundary saves us looking for a composition boundary
1087      * when we find a "no" or "maybe".
1088      *
1089      * When we back out from prevSrc back to prevBoundary,
1090      * then we also remove those same characters (which had been simply copied
1091      * or canonically-order-inserted) from the ReorderingBuffer.
1092      * Therefore, at all times, the [prevBoundary..prevSrc[ source units
1093      * must correspond 1:1 to destination units at the end of the destination buffer.
1094      */
1095     const UChar *prevBoundary=src;
1096     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1097     if(limit==NULL) {
1098         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
1099                                            doCompose ? &buffer : NULL,
1100                                            errorCode);
1101         if(U_FAILURE(errorCode)) {
1102             return FALSE;
1103         }
1104         if(prevBoundary<src) {
1105             // Set prevBoundary to the last character in the prefix.
1106             prevBoundary=src-1;
1107         }
1108         limit=u_strchr(src, 0);
1109     }
1110 
1111     const UChar *prevSrc;
1112     UChar32 c=0;
1113     uint16_t norm16=0;
1114 
1115     // only for isNormalized
1116     uint8_t prevCC=0;
1117 
1118     for(;;) {
1119         // count code units below the minimum or with irrelevant data for the quick check
1120         for(prevSrc=src; src!=limit;) {
1121             if( (c=*src)<minNoMaybeCP ||
1122                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1123             ) {
1124                 ++src;
1125             } else if(!U16_IS_SURROGATE(c)) {
1126                 break;
1127             } else {
1128                 UChar c2;
1129                 if(U16_IS_SURROGATE_LEAD(c)) {
1130                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1131                         c=U16_GET_SUPPLEMENTARY(c, c2);
1132                     }
1133                 } else /* trail surrogate */ {
1134                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1135                         --src;
1136                         c=U16_GET_SUPPLEMENTARY(c2, c);
1137                     }
1138                 }
1139                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1140                     src+=U16_LENGTH(c);
1141                 } else {
1142                     break;
1143                 }
1144             }
1145         }
1146         // copy these code units all at once
1147         if(src!=prevSrc) {
1148             if(doCompose) {
1149                 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
1150                     break;
1151                 }
1152             } else {
1153                 prevCC=0;
1154             }
1155             if(src==limit) {
1156                 break;
1157             }
1158             // Set prevBoundary to the last character in the quick check loop.
1159             prevBoundary=src-1;
1160             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1161                 U16_IS_LEAD(*(prevBoundary-1))
1162             ) {
1163                 --prevBoundary;
1164             }
1165             // The start of the current character (c).
1166             prevSrc=src;
1167         } else if(src==limit) {
1168             break;
1169         }
1170 
1171         src+=U16_LENGTH(c);
1172         /*
1173          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1174          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1175          * or has ccc!=0.
1176          * Check for Jamo V/T, then for regular characters.
1177          * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1178          */
1179         if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
1180             UChar prev=*(prevSrc-1);
1181             UBool needToDecompose=FALSE;
1182             if(c<Hangul::JAMO_T_BASE) {
1183                 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1184                 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
1185                 if(prev<Hangul::JAMO_L_COUNT) {
1186                     if(!doCompose) {
1187                         return FALSE;
1188                     }
1189                     UChar syllable=(UChar)
1190                         (Hangul::HANGUL_BASE+
1191                          (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1192                          Hangul::JAMO_T_COUNT);
1193                     UChar t;
1194                     if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1195                         ++src;
1196                         syllable+=t;  // The next character was a Jamo T.
1197                         prevBoundary=src;
1198                         buffer.setLastChar(syllable);
1199                         continue;
1200                     }
1201                     // If we see L+V+x where x!=T then we drop to the slow path,
1202                     // decompose and recompose.
1203                     // This is to deal with NFKC finding normal L and V but a
1204                     // compatibility variant of a T. We need to either fully compose that
1205                     // combination here (which would complicate the code and may not work
1206                     // with strange custom data) or use the slow path -- or else our replacing
1207                     // two input characters (L+V) with one output character (LV syllable)
1208                     // would violate the invariant that [prevBoundary..prevSrc[ has the same
1209                     // length as what we appended to the buffer since prevBoundary.
1210                     needToDecompose=TRUE;
1211                 }
1212             } else if(Hangul::isHangulWithoutJamoT(prev)) {
1213                 // c is a Jamo Trailing consonant,
1214                 // compose with previous Hangul LV that does not contain a Jamo T.
1215                 if(!doCompose) {
1216                     return FALSE;
1217                 }
1218                 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
1219                 prevBoundary=src;
1220                 continue;
1221             }
1222             if(!needToDecompose) {
1223                 // The Jamo V/T did not compose into a Hangul syllable.
1224                 if(doCompose) {
1225                     if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
1226                         break;
1227                     }
1228                 } else {
1229                     prevCC=0;
1230                 }
1231                 continue;
1232             }
1233         }
1234         /*
1235          * Source buffer pointers:
1236          *
1237          *  all done      quick check   current char  not yet
1238          *                "yes" but     (c)           processed
1239          *                may combine
1240          *                forward
1241          * [-------------[-------------[-------------[-------------[
1242          * |             |             |             |             |
1243          * orig. src     prevBoundary  prevSrc       src           limit
1244          *
1245          *
1246          * Destination buffer pointers inside the ReorderingBuffer:
1247          *
1248          *  all done      might take    not filled yet
1249          *                characters for
1250          *                reordering
1251          * [-------------[-------------[-------------[
1252          * |             |             |             |
1253          * start         reorderStart  limit         |
1254          *                             +remainingCap.+
1255          */
1256         if(norm16>=MIN_YES_YES_WITH_CC) {
1257             uint8_t cc=(uint8_t)norm16;  // cc!=0
1258             if( onlyContiguous &&  // FCC
1259                 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1260                 prevBoundary<prevSrc &&
1261                 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1262                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1263                 // passed the quick check "yes && ccc==0" test.
1264                 // Check whether the last character was a "yesYes" or a "yesNo".
1265                 // If a "yesNo", then we get its trailing ccc from its
1266                 // mapping and check for canonical order.
1267                 // All other cases are ok.
1268                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1269             ) {
1270                 // Fails FCD test, need to decompose and contiguously recompose.
1271                 if(!doCompose) {
1272                     return FALSE;
1273                 }
1274             } else if(doCompose) {
1275                 if(!buffer.append(c, cc, errorCode)) {
1276                     break;
1277                 }
1278                 continue;
1279             } else if(prevCC<=cc) {
1280                 prevCC=cc;
1281                 continue;
1282             } else {
1283                 return FALSE;
1284             }
1285         } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1286             return FALSE;
1287         }
1288 
1289         /*
1290          * Find appropriate boundaries around this character,
1291          * decompose the source text from between the boundaries,
1292          * and recompose it.
1293          *
1294          * We may need to remove the last few characters from the ReorderingBuffer
1295          * to account for source text that was copied or appended
1296          * but needs to take part in the recomposition.
1297          */
1298 
1299         /*
1300          * Find the last composition boundary in [prevBoundary..src[.
1301          * It is either the decomposition of the current character (at prevSrc),
1302          * or prevBoundary.
1303          */
1304         if(hasCompBoundaryBefore(c, norm16)) {
1305             prevBoundary=prevSrc;
1306         } else if(doCompose) {
1307             buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
1308         }
1309 
1310         // Find the next composition boundary in [src..limit[ -
1311         // modifies src to point to the next starter.
1312         src=(UChar *)findNextCompBoundary(src, limit);
1313 
1314         // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1315         int32_t recomposeStartIndex=buffer.length();
1316         if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
1317             break;
1318         }
1319         recompose(buffer, recomposeStartIndex, onlyContiguous);
1320         if(!doCompose) {
1321             if(!buffer.equals(prevBoundary, src)) {
1322                 return FALSE;
1323             }
1324             buffer.remove();
1325             prevCC=0;
1326         }
1327 
1328         // Move to the next starter. We never need to look back before this point again.
1329         prevBoundary=src;
1330     }
1331     return TRUE;
1332 }
1333 
1334 // Very similar to compose(): Make the same changes in both places if relevant.
1335 // pQCResult==NULL: spanQuickCheckYes
1336 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1337 const UChar *
composeQuickCheck(const UChar * src,const UChar * limit,UBool onlyContiguous,UNormalizationCheckResult * pQCResult) const1338 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1339                                    UBool onlyContiguous,
1340                                    UNormalizationCheckResult *pQCResult) const {
1341     /*
1342      * prevBoundary points to the last character before the current one
1343      * that has a composition boundary before it with ccc==0 and quick check "yes".
1344      */
1345     const UChar *prevBoundary=src;
1346     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1347     if(limit==NULL) {
1348         UErrorCode errorCode=U_ZERO_ERROR;
1349         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1350         if(prevBoundary<src) {
1351             // Set prevBoundary to the last character in the prefix.
1352             prevBoundary=src-1;
1353         }
1354         limit=u_strchr(src, 0);
1355     }
1356 
1357     const UChar *prevSrc;
1358     UChar32 c=0;
1359     uint16_t norm16=0;
1360     uint8_t prevCC=0;
1361 
1362     for(;;) {
1363         // count code units below the minimum or with irrelevant data for the quick check
1364         for(prevSrc=src;;) {
1365             if(src==limit) {
1366                 return src;
1367             }
1368             if( (c=*src)<minNoMaybeCP ||
1369                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1370             ) {
1371                 ++src;
1372             } else if(!U16_IS_SURROGATE(c)) {
1373                 break;
1374             } else {
1375                 UChar c2;
1376                 if(U16_IS_SURROGATE_LEAD(c)) {
1377                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1378                         c=U16_GET_SUPPLEMENTARY(c, c2);
1379                     }
1380                 } else /* trail surrogate */ {
1381                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1382                         --src;
1383                         c=U16_GET_SUPPLEMENTARY(c2, c);
1384                     }
1385                 }
1386                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1387                     src+=U16_LENGTH(c);
1388                 } else {
1389                     break;
1390                 }
1391             }
1392         }
1393         if(src!=prevSrc) {
1394             // Set prevBoundary to the last character in the quick check loop.
1395             prevBoundary=src-1;
1396             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1397                 U16_IS_LEAD(*(prevBoundary-1))
1398             ) {
1399                 --prevBoundary;
1400             }
1401             prevCC=0;
1402             // The start of the current character (c).
1403             prevSrc=src;
1404         }
1405 
1406         src+=U16_LENGTH(c);
1407         /*
1408          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1409          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1410          * or has ccc!=0.
1411          */
1412         if(isMaybeOrNonZeroCC(norm16)) {
1413             uint8_t cc=getCCFromYesOrMaybe(norm16);
1414             if( onlyContiguous &&  // FCC
1415                 cc!=0 &&
1416                 prevCC==0 &&
1417                 prevBoundary<prevSrc &&
1418                 // prevCC==0 && prevBoundary<prevSrc tell us that
1419                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1420                 // passed the quick check "yes && ccc==0" test.
1421                 // Check whether the last character was a "yesYes" or a "yesNo".
1422                 // If a "yesNo", then we get its trailing ccc from its
1423                 // mapping and check for canonical order.
1424                 // All other cases are ok.
1425                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1426             ) {
1427                 // Fails FCD test.
1428             } else if(prevCC<=cc || cc==0) {
1429                 prevCC=cc;
1430                 if(norm16<MIN_YES_YES_WITH_CC) {
1431                     if(pQCResult!=NULL) {
1432                         *pQCResult=UNORM_MAYBE;
1433                     } else {
1434                         return prevBoundary;
1435                     }
1436                 }
1437                 continue;
1438             }
1439         }
1440         if(pQCResult!=NULL) {
1441             *pQCResult=UNORM_NO;
1442         }
1443         return prevBoundary;
1444     }
1445 }
1446 
composeAndAppend(const UChar * src,const UChar * limit,UBool doCompose,UBool onlyContiguous,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1447 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1448                                        UBool doCompose,
1449                                        UBool onlyContiguous,
1450                                        UnicodeString &safeMiddle,
1451                                        ReorderingBuffer &buffer,
1452                                        UErrorCode &errorCode) const {
1453     if(!buffer.isEmpty()) {
1454         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
1455         if(src!=firstStarterInSrc) {
1456             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1457                                                                     buffer.getLimit());
1458             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1459             UnicodeString middle(lastStarterInDest, destSuffixLength);
1460             buffer.removeSuffix(destSuffixLength);
1461             safeMiddle=middle;
1462             middle.append(src, (int32_t)(firstStarterInSrc-src));
1463             const UChar *middleStart=middle.getBuffer();
1464             compose(middleStart, middleStart+middle.length(), onlyContiguous,
1465                     TRUE, buffer, errorCode);
1466             if(U_FAILURE(errorCode)) {
1467                 return;
1468             }
1469             src=firstStarterInSrc;
1470         }
1471     }
1472     if(doCompose) {
1473         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1474     } else {
1475         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1476             limit=u_strchr(src, 0);
1477         }
1478         buffer.appendZeroCC(src, limit, errorCode);
1479     }
1480 }
1481 
1482 /**
1483  * Does c have a composition boundary before it?
1484  * True if its decomposition begins with a character that has
1485  * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1486  * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1487  * (isCompYesAndZeroCC()) so we need not decompose.
1488  */
hasCompBoundaryBefore(UChar32 c,uint16_t norm16) const1489 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
1490     for(;;) {
1491         if(isCompYesAndZeroCC(norm16)) {
1492             return TRUE;
1493         } else if(isMaybeOrNonZeroCC(norm16)) {
1494             return FALSE;
1495         } else if(isDecompNoAlgorithmic(norm16)) {
1496             c=mapAlgorithmic(c, norm16);
1497             norm16=getNorm16(c);
1498         } else {
1499             // c decomposes, get everything from the variable-length extra data
1500             const uint16_t *mapping=getMapping(norm16);
1501             uint16_t firstUnit=*mapping;
1502             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1503                 return FALSE;
1504             }
1505             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
1506                 return FALSE;  // non-zero leadCC
1507             }
1508             int32_t i=1;  // skip over the firstUnit
1509             UChar32 c;
1510             U16_NEXT_UNSAFE(mapping, i, c);
1511             return isCompYesAndZeroCC(getNorm16(c));
1512         }
1513     }
1514 }
1515 
hasCompBoundaryAfter(UChar32 c,UBool onlyContiguous,UBool testInert) const1516 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
1517     for(;;) {
1518         uint16_t norm16=getNorm16(c);
1519         if(isInert(norm16)) {
1520             return TRUE;
1521         } else if(norm16<=minYesNo) {
1522             // Hangul: norm16==minYesNo
1523             // Hangul LVT has a boundary after it.
1524             // Hangul LV and non-inert yesYes characters combine forward.
1525             return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
1526         } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1527             return FALSE;
1528         } else if(isDecompNoAlgorithmic(norm16)) {
1529             c=mapAlgorithmic(c, norm16);
1530         } else {
1531             // c decomposes, get everything from the variable-length extra data.
1532             // If testInert, then c must be a yesNo character which has lccc=0,
1533             // otherwise it could be a noNo.
1534             const uint16_t *mapping=getMapping(norm16);
1535             uint16_t firstUnit=*mapping;
1536             // TRUE if
1537             //   not MAPPING_NO_COMP_BOUNDARY_AFTER
1538             //     (which is set if
1539             //       c is not deleted, and
1540             //       it and its decomposition do not combine forward, and it has a starter)
1541             //   and if FCC then trailCC<=1
1542             return
1543                 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
1544                 (!onlyContiguous || firstUnit<=0x1ff);
1545         }
1546     }
1547 }
1548 
findPreviousCompBoundary(const UChar * start,const UChar * p) const1549 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
1550     BackwardUTrie2StringIterator iter(normTrie, start, p);
1551     uint16_t norm16;
1552     do {
1553         norm16=iter.previous16();
1554     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1555     // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
1556     // but that's probably not worth the extra cost.
1557     return iter.codePointStart;
1558 }
1559 
findNextCompBoundary(const UChar * p,const UChar * limit) const1560 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
1561     ForwardUTrie2StringIterator iter(normTrie, p, limit);
1562     uint16_t norm16;
1563     do {
1564         norm16=iter.next16();
1565     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1566     return iter.codePointStart;
1567 }
1568 
1569 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
1570 // still had getFCDTrie() which built and cached an FCD trie.
1571 // That provided faster access to FCD data than getFCD16FromNormData()
1572 // but required synchronization and consumed some 10kB of heap memory
1573 // in any process that uses FCD (e.g., via collation).
1574 // tccc180[] and smallFCD[] are intended to help with any loss of performance,
1575 // at least for Latin & CJK.
1576 
1577 // Gets the FCD value from the regular normalization data.
getFCD16FromNormData(UChar32 c) const1578 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
1579     // Only loops for 1:1 algorithmic mappings.
1580     for(;;) {
1581         uint16_t norm16=getNorm16(c);
1582         if(norm16<=minYesNo) {
1583             // no decomposition or Hangul syllable, all zeros
1584             return 0;
1585         } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
1586             // combining mark
1587             norm16&=0xff;
1588             return norm16|(norm16<<8);
1589         } else if(norm16>=minMaybeYes) {
1590             return 0;
1591         } else if(isDecompNoAlgorithmic(norm16)) {
1592             c=mapAlgorithmic(c, norm16);
1593         } else {
1594             // c decomposes, get everything from the variable-length extra data
1595             const uint16_t *mapping=getMapping(norm16);
1596             uint16_t firstUnit=*mapping;
1597             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1598                 // A character that is deleted (maps to an empty string) must
1599                 // get the worst-case lccc and tccc values because arbitrary
1600                 // characters on both sides will become adjacent.
1601                 return 0x1ff;
1602             } else {
1603                 norm16=firstUnit>>8;  // tccc
1604                 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
1605                     norm16|=*(mapping-1)&0xff00;  // lccc
1606                 }
1607                 return norm16;
1608             }
1609         }
1610     }
1611 }
1612 
1613 // Dual functionality:
1614 // buffer!=NULL: normalize
1615 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1616 const UChar *
makeFCD(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const1617 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
1618                          ReorderingBuffer *buffer,
1619                          UErrorCode &errorCode) const {
1620     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1621     // Similar to the prevBoundary in the compose() implementation.
1622     const UChar *prevBoundary=src;
1623     int32_t prevFCD16=0;
1624     if(limit==NULL) {
1625         src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
1626         if(U_FAILURE(errorCode)) {
1627             return src;
1628         }
1629         if(prevBoundary<src) {
1630             prevBoundary=src;
1631             // We know that the previous character's lccc==0.
1632             // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1633             prevFCD16=getFCD16(*(src-1));
1634             if(prevFCD16>1) {
1635                 --prevBoundary;
1636             }
1637         }
1638         limit=u_strchr(src, 0);
1639     }
1640 
1641     // Note: In this function we use buffer->appendZeroCC() because we track
1642     // the lead and trail combining classes here, rather than leaving it to
1643     // the ReorderingBuffer.
1644     // The exception is the call to decomposeShort() which uses the buffer
1645     // in the normal way.
1646 
1647     const UChar *prevSrc;
1648     UChar32 c=0;
1649     uint16_t fcd16=0;
1650 
1651     for(;;) {
1652         // count code units with lccc==0
1653         for(prevSrc=src; src!=limit;) {
1654             if((c=*src)<MIN_CCC_LCCC_CP) {
1655                 prevFCD16=~c;
1656                 ++src;
1657             } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1658                 prevFCD16=0;
1659                 ++src;
1660             } else {
1661                 if(U16_IS_SURROGATE(c)) {
1662                     UChar c2;
1663                     if(U16_IS_SURROGATE_LEAD(c)) {
1664                         if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1665                             c=U16_GET_SUPPLEMENTARY(c, c2);
1666                         }
1667                     } else /* trail surrogate */ {
1668                         if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1669                             --src;
1670                             c=U16_GET_SUPPLEMENTARY(c2, c);
1671                         }
1672                     }
1673                 }
1674                 if((fcd16=getFCD16FromNormData(c))<=0xff) {
1675                     prevFCD16=fcd16;
1676                     src+=U16_LENGTH(c);
1677                 } else {
1678                     break;
1679                 }
1680             }
1681         }
1682         // copy these code units all at once
1683         if(src!=prevSrc) {
1684             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
1685                 break;
1686             }
1687             if(src==limit) {
1688                 break;
1689             }
1690             prevBoundary=src;
1691             // We know that the previous character's lccc==0.
1692             if(prevFCD16<0) {
1693                 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1694                 UChar32 prev=~prevFCD16;
1695                 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
1696                 if(prevFCD16>1) {
1697                     --prevBoundary;
1698                 }
1699             } else {
1700                 const UChar *p=src-1;
1701                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
1702                     --p;
1703                     // Need to fetch the previous character's FCD value because
1704                     // prevFCD16 was just for the trail surrogate code point.
1705                     prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
1706                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1707                 }
1708                 if(prevFCD16>1) {
1709                     prevBoundary=p;
1710                 }
1711             }
1712             // The start of the current character (c).
1713             prevSrc=src;
1714         } else if(src==limit) {
1715             break;
1716         }
1717 
1718         src+=U16_LENGTH(c);
1719         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1720         // Check for proper order, and decompose locally if necessary.
1721         if((prevFCD16&0xff)<=(fcd16>>8)) {
1722             // proper order: prev tccc <= current lccc
1723             if((fcd16&0xff)<=1) {
1724                 prevBoundary=src;
1725             }
1726             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
1727                 break;
1728             }
1729             prevFCD16=fcd16;
1730             continue;
1731         } else if(buffer==NULL) {
1732             return prevBoundary;  // quick check "no"
1733         } else {
1734             /*
1735              * Back out the part of the source that we copied or appended
1736              * already but is now going to be decomposed.
1737              * prevSrc is set to after what was copied/appended.
1738              */
1739             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
1740             /*
1741              * Find the part of the source that needs to be decomposed,
1742              * up to the next safe boundary.
1743              */
1744             src=findNextFCDBoundary(src, limit);
1745             /*
1746              * The source text does not fulfill the conditions for FCD.
1747              * Decompose and reorder a limited piece of the text.
1748              */
1749             if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
1750                 break;
1751             }
1752             prevBoundary=src;
1753             prevFCD16=0;
1754         }
1755     }
1756     return src;
1757 }
1758 
makeFCDAndAppend(const UChar * src,const UChar * limit,UBool doMakeFCD,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1759 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
1760                                        UBool doMakeFCD,
1761                                        UnicodeString &safeMiddle,
1762                                        ReorderingBuffer &buffer,
1763                                        UErrorCode &errorCode) const {
1764     if(!buffer.isEmpty()) {
1765         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
1766         if(src!=firstBoundaryInSrc) {
1767             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
1768                                                                     buffer.getLimit());
1769             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
1770             UnicodeString middle(lastBoundaryInDest, destSuffixLength);
1771             buffer.removeSuffix(destSuffixLength);
1772             safeMiddle=middle;
1773             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
1774             const UChar *middleStart=middle.getBuffer();
1775             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
1776             if(U_FAILURE(errorCode)) {
1777                 return;
1778             }
1779             src=firstBoundaryInSrc;
1780         }
1781     }
1782     if(doMakeFCD) {
1783         makeFCD(src, limit, &buffer, errorCode);
1784     } else {
1785         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1786             limit=u_strchr(src, 0);
1787         }
1788         buffer.appendZeroCC(src, limit, errorCode);
1789     }
1790 }
1791 
findPreviousFCDBoundary(const UChar * start,const UChar * p) const1792 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
1793     while(start<p && previousFCD16(start, p)>0xff) {}
1794     return p;
1795 }
1796 
findNextFCDBoundary(const UChar * p,const UChar * limit) const1797 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
1798     while(p<limit) {
1799         const UChar *codePointStart=p;
1800         if(nextFCD16(p, limit)<=0xff) {
1801             return codePointStart;
1802         }
1803     }
1804     return p;
1805 }
1806 
1807 // CanonicalIterator data -------------------------------------------------- ***
1808 
CanonIterData(UErrorCode & errorCode)1809 CanonIterData::CanonIterData(UErrorCode &errorCode) :
1810         trie(utrie2_open(0, 0, &errorCode)),
1811         canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
1812 
~CanonIterData()1813 CanonIterData::~CanonIterData() {
1814     utrie2_close(trie);
1815 }
1816 
addToStartSet(UChar32 origin,UChar32 decompLead,UErrorCode & errorCode)1817 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
1818     uint32_t canonValue=utrie2_get32(trie, decompLead);
1819     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
1820         // origin is the first character whose decomposition starts with
1821         // the character for which we are setting the value.
1822         utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
1823     } else {
1824         // origin is not the first character, or it is U+0000.
1825         UnicodeSet *set;
1826         if((canonValue&CANON_HAS_SET)==0) {
1827             set=new UnicodeSet;
1828             if(set==NULL) {
1829                 errorCode=U_MEMORY_ALLOCATION_ERROR;
1830                 return;
1831             }
1832             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
1833             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
1834             utrie2_set32(trie, decompLead, canonValue, &errorCode);
1835             canonStartSets.addElement(set, errorCode);
1836             if(firstOrigin!=0) {
1837                 set->add(firstOrigin);
1838             }
1839         } else {
1840             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
1841         }
1842         set->add(origin);
1843     }
1844 }
1845 
1846 U_CDECL_BEGIN
1847 
1848 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1849 //     context: the Normalizer2Impl
1850 static UBool U_CALLCONV
enumCIDRangeHandler(const void * context,UChar32 start,UChar32 end,uint32_t value)1851 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1852     UErrorCode errorCode = U_ZERO_ERROR;
1853     if (value != 0) {
1854         Normalizer2Impl *impl = (Normalizer2Impl *)context;
1855         impl->makeCanonIterDataFromNorm16(
1856             start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);
1857     }
1858     return U_SUCCESS(errorCode);
1859 }
1860 
1861 
1862 
1863 // UInitOnce instantiation function for CanonIterData
1864 
1865 static void U_CALLCONV
initCanonIterData(Normalizer2Impl * impl,UErrorCode & errorCode)1866 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
1867     U_ASSERT(impl->fCanonIterData == NULL);
1868     impl->fCanonIterData = new CanonIterData(errorCode);
1869     if (impl->fCanonIterData == NULL) {
1870         errorCode=U_MEMORY_ALLOCATION_ERROR;
1871     }
1872     if (U_SUCCESS(errorCode)) {
1873         utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);
1874         utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
1875     }
1876     if (U_FAILURE(errorCode)) {
1877         delete impl->fCanonIterData;
1878         impl->fCanonIterData = NULL;
1879     }
1880 }
1881 
1882 U_CDECL_END
1883 
makeCanonIterDataFromNorm16(UChar32 start,UChar32 end,uint16_t norm16,CanonIterData & newData,UErrorCode & errorCode) const1884 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1885                                                   CanonIterData &newData,
1886                                                   UErrorCode &errorCode) const {
1887     if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
1888         // Inert, or 2-way mapping (including Hangul syllable).
1889         // We do not write a canonStartSet for any yesNo character.
1890         // Composites from 2-way mappings are added at runtime from the
1891         // starter's compositions list, and the other characters in
1892         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
1893         // "maybe" characters.
1894         return;
1895     }
1896     for(UChar32 c=start; c<=end; ++c) {
1897         uint32_t oldValue=utrie2_get32(newData.trie, c);
1898         uint32_t newValue=oldValue;
1899         if(norm16>=minMaybeYes) {
1900             // not a segment starter if it occurs in a decomposition or has cc!=0
1901             newValue|=CANON_NOT_SEGMENT_STARTER;
1902             if(norm16<MIN_NORMAL_MAYBE_YES) {
1903                 newValue|=CANON_HAS_COMPOSITIONS;
1904             }
1905         } else if(norm16<minYesNo) {
1906             newValue|=CANON_HAS_COMPOSITIONS;
1907         } else {
1908             // c has a one-way decomposition
1909             UChar32 c2=c;
1910             uint16_t norm16_2=norm16;
1911             while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
1912                 c2=mapAlgorithmic(c2, norm16_2);
1913                 norm16_2=getNorm16(c2);
1914             }
1915             if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
1916                 // c decomposes, get everything from the variable-length extra data
1917                 const uint16_t *mapping=getMapping(norm16_2);
1918                 uint16_t firstUnit=*mapping;
1919                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
1920                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1921                     if(c==c2 && (*(mapping-1)&0xff)!=0) {
1922                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
1923                     }
1924                 }
1925                 // Skip empty mappings (no characters in the decomposition).
1926                 if(length!=0) {
1927                     ++mapping;  // skip over the firstUnit
1928                     // add c to first code point's start set
1929                     int32_t i=0;
1930                     U16_NEXT_UNSAFE(mapping, i, c2);
1931                     newData.addToStartSet(c, c2, errorCode);
1932                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
1933                     // one-way mapping. A 2-way mapping is possible here after
1934                     // intermediate algorithmic mapping.
1935                     if(norm16_2>=minNoNo) {
1936                         while(i<length) {
1937                             U16_NEXT_UNSAFE(mapping, i, c2);
1938                             uint32_t c2Value=utrie2_get32(newData.trie, c2);
1939                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
1940                                 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
1941                                              &errorCode);
1942                             }
1943                         }
1944                     }
1945                 }
1946             } else {
1947                 // c decomposed to c2 algorithmically; c has cc==0
1948                 newData.addToStartSet(c, c2, errorCode);
1949             }
1950         }
1951         if(newValue!=oldValue) {
1952             utrie2_set32(newData.trie, c, newValue, &errorCode);
1953         }
1954     }
1955 }
1956 
ensureCanonIterData(UErrorCode & errorCode) const1957 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
1958     // Logically const: Synchronized instantiation.
1959     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1960     umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
1961     return U_SUCCESS(errorCode);
1962 }
1963 
getCanonValue(UChar32 c) const1964 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
1965     return (int32_t)utrie2_get32(fCanonIterData->trie, c);
1966 }
1967 
getCanonStartSet(int32_t n) const1968 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
1969     return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
1970 }
1971 
isCanonSegmentStarter(UChar32 c) const1972 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
1973     return getCanonValue(c)>=0;
1974 }
1975 
getCanonStartSet(UChar32 c,UnicodeSet & set) const1976 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
1977     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
1978     if(canonValue==0) {
1979         return FALSE;
1980     }
1981     set.clear();
1982     int32_t value=canonValue&CANON_VALUE_MASK;
1983     if((canonValue&CANON_HAS_SET)!=0) {
1984         set.addAll(getCanonStartSet(value));
1985     } else if(value!=0) {
1986         set.add(value);
1987     }
1988     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
1989         uint16_t norm16=getNorm16(c);
1990         if(norm16==JAMO_L) {
1991             UChar32 syllable=
1992                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
1993             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
1994         } else {
1995             addComposites(getCompositionsList(norm16), set);
1996         }
1997     }
1998     return TRUE;
1999 }
2000 
2001 U_NAMESPACE_END
2002 
2003 // Normalizer2 data swapping ----------------------------------------------- ***
2004 
2005 U_NAMESPACE_USE
2006 
2007 U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)2008 unorm2_swap(const UDataSwapper *ds,
2009             const void *inData, int32_t length, void *outData,
2010             UErrorCode *pErrorCode) {
2011     const UDataInfo *pInfo;
2012     int32_t headerSize;
2013 
2014     const uint8_t *inBytes;
2015     uint8_t *outBytes;
2016 
2017     const int32_t *inIndexes;
2018     int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
2019 
2020     int32_t i, offset, nextOffset, size;
2021 
2022     /* udata_swapDataHeader checks the arguments */
2023     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
2024     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
2025         return 0;
2026     }
2027 
2028     /* check data format and format version */
2029     pInfo=(const UDataInfo *)((const char *)inData+4);
2030     if(!(
2031         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
2032         pInfo->dataFormat[1]==0x72 &&
2033         pInfo->dataFormat[2]==0x6d &&
2034         pInfo->dataFormat[3]==0x32 &&
2035         (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
2036     )) {
2037         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2038                          pInfo->dataFormat[0], pInfo->dataFormat[1],
2039                          pInfo->dataFormat[2], pInfo->dataFormat[3],
2040                          pInfo->formatVersion[0]);
2041         *pErrorCode=U_UNSUPPORTED_ERROR;
2042         return 0;
2043     }
2044 
2045     inBytes=(const uint8_t *)inData+headerSize;
2046     outBytes=(uint8_t *)outData+headerSize;
2047 
2048     inIndexes=(const int32_t *)inBytes;
2049 
2050     if(length>=0) {
2051         length-=headerSize;
2052         if(length<(int32_t)sizeof(indexes)) {
2053             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2054                              length);
2055             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2056             return 0;
2057         }
2058     }
2059 
2060     /* read the first few indexes */
2061     for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
2062         indexes[i]=udata_readInt32(ds, inIndexes[i]);
2063     }
2064 
2065     /* get the total length of the data */
2066     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2067 
2068     if(length>=0) {
2069         if(length<size) {
2070             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2071                              length);
2072             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2073             return 0;
2074         }
2075 
2076         /* copy the data for inaccessible bytes */
2077         if(inBytes!=outBytes) {
2078             uprv_memcpy(outBytes, inBytes, size);
2079         }
2080 
2081         offset=0;
2082 
2083         /* swap the int32_t indexes[] */
2084         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2085         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2086         offset=nextOffset;
2087 
2088         /* swap the UTrie2 */
2089         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2090         utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2091         offset=nextOffset;
2092 
2093         /* swap the uint16_t extraData[] */
2094         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2095         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2096         offset=nextOffset;
2097 
2098         /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2099         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2100         offset=nextOffset;
2101 
2102         U_ASSERT(offset==size);
2103     }
2104 
2105     return headerSize+size;
2106 }
2107 
2108 #endif  // !UCONFIG_NO_NORMALIZATION
2109