1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  normalizer2impl.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009nov22
16 *   created by: Markus W. Scherer
17 */
18 
19 // #define UCPTRIE_DEBUG
20 
21 #include "unicode/utypes.h"
22 
23 #if !UCONFIG_NO_NORMALIZATION
24 
25 #include "unicode/bytestream.h"
26 #include "unicode/edits.h"
27 #include "unicode/normalizer2.h"
28 #include "unicode/stringoptions.h"
29 #include "unicode/ucptrie.h"
30 #include "unicode/udata.h"
31 #include "unicode/umutablecptrie.h"
32 #include "unicode/ustring.h"
33 #include "unicode/utf16.h"
34 #include "unicode/utf8.h"
35 #include "bytesinkutil.h"
36 #include "cmemory.h"
37 #include "mutex.h"
38 #include "normalizer2impl.h"
39 #include "putilimp.h"
40 #include "uassert.h"
41 #include "ucptrie_impl.h"
42 #include "uset_imp.h"
43 #include "uvector.h"
44 
45 U_NAMESPACE_BEGIN
46 
47 namespace {
48 
49 /**
50  * UTF-8 lead byte for minNoMaybeCP.
51  * Can be lower than the actual lead byte for c.
52  * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.
53  */
leadByteForCP(UChar32 c)54 inline uint8_t leadByteForCP(UChar32 c) {
55     if (c <= 0x7f) {
56         return (uint8_t)c;
57     } else if (c <= 0x7ff) {
58         return (uint8_t)(0xc0+(c>>6));
59     } else {
60         // Should not occur because ccc(U+0300)!=0.
61         return 0xe0;
62     }
63 }
64 
65 /**
66  * Returns the code point from one single well-formed UTF-8 byte sequence
67  * between cpStart and cpLimit.
68  *
69  * Trie UTF-8 macros do not assemble whole code points (for efficiency).
70  * When we do need the code point, we call this function.
71  * We should not need it for normalization-inert data (norm16==0).
72  * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
73  */
codePointFromValidUTF8(const uint8_t * cpStart,const uint8_t * cpLimit)74 UChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) {
75     // Similar to U8_NEXT_UNSAFE(s, i, c).
76     U_ASSERT(cpStart < cpLimit);
77     uint8_t c = *cpStart;
78     switch(cpLimit-cpStart) {
79     case 1:
80         return c;
81     case 2:
82         return ((c&0x1f)<<6) | (cpStart[1]&0x3f);
83     case 3:
84         // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar)
85         return (UChar)((c<<12) | ((cpStart[1]&0x3f)<<6) | (cpStart[2]&0x3f));
86     case 4:
87         return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f);
88     default:
89         UPRV_UNREACHABLE;  // Should not occur.
90     }
91 }
92 
93 /**
94  * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.
95  * Otherwise returns a negative value.
96  */
previousHangulOrJamo(const uint8_t * start,const uint8_t * p)97 UChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) {
98     if ((p - start) >= 3) {
99         p -= 3;
100         uint8_t l = *p;
101         uint8_t t1, t2;
102         if (0xe1 <= l && l <= 0xed &&
103                 (t1 = (uint8_t)(p[1] - 0x80)) <= 0x3f &&
104                 (t2 = (uint8_t)(p[2] - 0x80)) <= 0x3f &&
105                 (l < 0xed || t1 <= 0x1f)) {
106             return ((l & 0xf) << 12) | (t1 << 6) | t2;
107         }
108     }
109     return U_SENTINEL;
110 }
111 
/**
 * Checks whether [src, limit[ starts with a single Jamo T code point
 * and, if so, returns its offset from the Jamo T base.
 *
 * @param src   start of the UTF-8 text
 * @param limit end of the UTF-8 text
 * @return the positive offset from the Jamo T base, or a negative value if there is no Jamo T
 */
int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
    // Jamo T: E1 86 A8..E1 87 82
    if ((limit - src) < 3 || src[0] != 0xe1) {
        return -1;
    }
    switch (src[1]) {
    case 0x86: {
        const uint8_t third = src[2];
        // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
        // Offset 0 does not correspond to any conjoining Jamo.
        if (0xa8 <= third && third <= 0xbf) {
            return third - 0xa7;
        }
        break;
    }
    case 0x87: {
        const uint8_t third = src[2];
        // Signed comparison: only bytes 0x80..0x82 are at or below (int8_t)0x82.
        if ((int8_t)third <= (int8_t)0x82u) {
            return third - (0xa7 - 0x40);
        }
        break;
    }
    default:
        break;
    }
    return -1;
}
135 
136 void
appendCodePointDelta(const uint8_t * cpStart,const uint8_t * cpLimit,int32_t delta,ByteSink & sink,Edits * edits)137 appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta,
138                      ByteSink &sink, Edits *edits) {
139     char buffer[U8_MAX_LENGTH];
140     int32_t length;
141     int32_t cpLength = (int32_t)(cpLimit - cpStart);
142     if (cpLength == 1) {
143         // The builder makes ASCII map to ASCII.
144         buffer[0] = (uint8_t)(*cpStart + delta);
145         length = 1;
146     } else {
147         int32_t trail = *(cpLimit-1) + delta;
148         if (0x80 <= trail && trail <= 0xbf) {
149             // The delta only changes the last trail byte.
150             --cpLimit;
151             length = 0;
152             do { buffer[length++] = *cpStart++; } while (cpStart < cpLimit);
153             buffer[length++] = (uint8_t)trail;
154         } else {
155             // Decode the code point, add the delta, re-encode.
156             UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta;
157             length = 0;
158             U8_APPEND_UNSAFE(buffer, length, c);
159         }
160     }
161     if (edits != nullptr) {
162         edits->addReplace(cpLength, length);
163     }
164     sink.Append(buffer, length);
165 }
166 
167 }  // namespace
168 
169 // ReorderingBuffer -------------------------------------------------------- ***
170 
ReorderingBuffer(const Normalizer2Impl & ni,UnicodeString & dest,UErrorCode & errorCode)171 ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest,
172                                    UErrorCode &errorCode) :
173         impl(ni), str(dest),
174         start(str.getBuffer(8)), reorderStart(start), limit(start),
175         remainingCapacity(str.getCapacity()), lastCC(0) {
176     if (start == nullptr && U_SUCCESS(errorCode)) {
177         // getBuffer() already did str.setToBogus()
178         errorCode = U_MEMORY_ALLOCATION_ERROR;
179     }
180 }
181 
init(int32_t destCapacity,UErrorCode & errorCode)182 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
183     int32_t length=str.length();
184     start=str.getBuffer(destCapacity);
185     if(start==NULL) {
186         // getBuffer() already did str.setToBogus()
187         errorCode=U_MEMORY_ALLOCATION_ERROR;
188         return FALSE;
189     }
190     limit=start+length;
191     remainingCapacity=str.getCapacity()-length;
192     reorderStart=start;
193     if(start==limit) {
194         lastCC=0;
195     } else {
196         setIterator();
197         lastCC=previousCC();
198         // Set reorderStart after the last code point with cc<=1 if there is one.
199         if(lastCC>1) {
200             while(previousCC()>1) {}
201         }
202         reorderStart=codePointLimit;
203     }
204     return TRUE;
205 }
206 
equals(const UChar * otherStart,const UChar * otherLimit) const207 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
208     int32_t length=(int32_t)(limit-start);
209     return
210         length==(int32_t)(otherLimit-otherStart) &&
211         0==u_memcmp(start, otherStart, length);
212 }
213 
equals(const uint8_t * otherStart,const uint8_t * otherLimit) const214 UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const {
215     U_ASSERT((otherLimit - otherStart) <= INT32_MAX);  // ensured by caller
216     int32_t length = (int32_t)(limit - start);
217     int32_t otherLength = (int32_t)(otherLimit - otherStart);
218     // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.
219     if (otherLength < length || (otherLength / 3) > length) {
220         return FALSE;
221     }
222     // Compare valid strings from between normalization boundaries.
223     // (Invalid sequences are normalization-inert.)
224     for (int32_t i = 0, j = 0;;) {
225         if (i >= length) {
226             return j >= otherLength;
227         } else if (j >= otherLength) {
228             return FALSE;
229         }
230         // Not at the end of either string yet.
231         UChar32 c, other;
232         U16_NEXT_UNSAFE(start, i, c);
233         U8_NEXT_UNSAFE(otherStart, j, other);
234         if (c != other) {
235             return FALSE;
236         }
237     }
238 }
239 
appendSupplementary(UChar32 c,uint8_t cc,UErrorCode & errorCode)240 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
241     if(remainingCapacity<2 && !resize(2, errorCode)) {
242         return FALSE;
243     }
244     if(lastCC<=cc || cc==0) {
245         limit[0]=U16_LEAD(c);
246         limit[1]=U16_TRAIL(c);
247         limit+=2;
248         lastCC=cc;
249         if(cc<=1) {
250             reorderStart=limit;
251         }
252     } else {
253         insert(c, cc);
254     }
255     remainingCapacity-=2;
256     return TRUE;
257 }
258 
append(const UChar * s,int32_t length,UBool isNFD,uint8_t leadCC,uint8_t trailCC,UErrorCode & errorCode)259 UBool ReorderingBuffer::append(const UChar *s, int32_t length, UBool isNFD,
260                                uint8_t leadCC, uint8_t trailCC,
261                                UErrorCode &errorCode) {
262     if(length==0) {
263         return TRUE;
264     }
265     if(remainingCapacity<length && !resize(length, errorCode)) {
266         return FALSE;
267     }
268     remainingCapacity-=length;
269     if(lastCC<=leadCC || leadCC==0) {
270         if(trailCC<=1) {
271             reorderStart=limit+length;
272         } else if(leadCC<=1) {
273             reorderStart=limit+1;  // Ok if not a code point boundary.
274         }
275         const UChar *sLimit=s+length;
276         do { *limit++=*s++; } while(s!=sLimit);
277         lastCC=trailCC;
278     } else {
279         int32_t i=0;
280         UChar32 c;
281         U16_NEXT(s, i, length, c);
282         insert(c, leadCC);  // insert first code point
283         while(i<length) {
284             U16_NEXT(s, i, length, c);
285             if(i<length) {
286                 if (isNFD) {
287                     leadCC = Normalizer2Impl::getCCFromYesOrMaybe(impl.getRawNorm16(c));
288                 } else {
289                     leadCC = impl.getCC(impl.getNorm16(c));
290                 }
291             } else {
292                 leadCC=trailCC;
293             }
294             append(c, leadCC, errorCode);
295         }
296     }
297     return TRUE;
298 }
299 
appendZeroCC(UChar32 c,UErrorCode & errorCode)300 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
301     int32_t cpLength=U16_LENGTH(c);
302     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
303         return FALSE;
304     }
305     remainingCapacity-=cpLength;
306     if(cpLength==1) {
307         *limit++=(UChar)c;
308     } else {
309         limit[0]=U16_LEAD(c);
310         limit[1]=U16_TRAIL(c);
311         limit+=2;
312     }
313     lastCC=0;
314     reorderStart=limit;
315     return TRUE;
316 }
317 
appendZeroCC(const UChar * s,const UChar * sLimit,UErrorCode & errorCode)318 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
319     if(s==sLimit) {
320         return TRUE;
321     }
322     int32_t length=(int32_t)(sLimit-s);
323     if(remainingCapacity<length && !resize(length, errorCode)) {
324         return FALSE;
325     }
326     u_memcpy(limit, s, length);
327     limit+=length;
328     remainingCapacity-=length;
329     lastCC=0;
330     reorderStart=limit;
331     return TRUE;
332 }
333 
remove()334 void ReorderingBuffer::remove() {
335     reorderStart=limit=start;
336     remainingCapacity=str.getCapacity();
337     lastCC=0;
338 }
339 
removeSuffix(int32_t suffixLength)340 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
341     if(suffixLength<(limit-start)) {
342         limit-=suffixLength;
343         remainingCapacity+=suffixLength;
344     } else {
345         limit=start;
346         remainingCapacity=str.getCapacity();
347     }
348     lastCC=0;
349     reorderStart=limit;
350 }
351 
resize(int32_t appendLength,UErrorCode & errorCode)352 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
353     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
354     int32_t length=(int32_t)(limit-start);
355     str.releaseBuffer(length);
356     int32_t newCapacity=length+appendLength;
357     int32_t doubleCapacity=2*str.getCapacity();
358     if(newCapacity<doubleCapacity) {
359         newCapacity=doubleCapacity;
360     }
361     if(newCapacity<256) {
362         newCapacity=256;
363     }
364     start=str.getBuffer(newCapacity);
365     if(start==NULL) {
366         // getBuffer() already did str.setToBogus()
367         errorCode=U_MEMORY_ALLOCATION_ERROR;
368         return FALSE;
369     }
370     reorderStart=start+reorderStartIndex;
371     limit=start+length;
372     remainingCapacity=str.getCapacity()-length;
373     return TRUE;
374 }
375 
skipPrevious()376 void ReorderingBuffer::skipPrevious() {
377     codePointLimit=codePointStart;
378     UChar c=*--codePointStart;
379     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
380         --codePointStart;
381     }
382 }
383 
previousCC()384 uint8_t ReorderingBuffer::previousCC() {
385     codePointLimit=codePointStart;
386     if(reorderStart>=codePointStart) {
387         return 0;
388     }
389     UChar32 c=*--codePointStart;
390     UChar c2;
391     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
392         --codePointStart;
393         c=U16_GET_SUPPLEMENTARY(c2, c);
394     }
395     return impl.getCCFromYesOrMaybeCP(c);
396 }
397 
398 // Inserts c somewhere before the last character.
399 // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(UChar32 c,uint8_t cc)400 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
401     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
402     // insert c at codePointLimit, after the character with prevCC<=cc
403     UChar *q=limit;
404     UChar *r=limit+=U16_LENGTH(c);
405     do {
406         *--r=*--q;
407     } while(codePointLimit!=q);
408     writeCodePoint(q, c);
409     if(cc<=1) {
410         reorderStart=r;
411     }
412 }
413 
414 // Normalizer2Impl --------------------------------------------------------- ***
415 
/**
 * Data holder for canonical-iterator support.
 * Only the declaration is visible here; the member functions are defined elsewhere.
 * NOTE(review): presumably this is the type behind Normalizer2Impl::fCanonIterData,
 * whose trie member is read by addCanonIterPropertyStarts() -- confirm in the header.
 */
struct CanonIterData : public UMemory {
    CanonIterData(UErrorCode &errorCode);
    ~CanonIterData();
    // Records an (origin, decompLead) relationship; see the definition for details.
    void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
    UMutableCPTrie *mutableTrie;  // mutable (builder) form of the trie
    UCPTrie *trie;                // immutable trie; this is what ucptrie_getRange() is called on
    UVector canonStartSets;  // contains UnicodeSet *
};
424 
/** Releases the canonical-iterator data; deleting a null pointer is a no-op. */
Normalizer2Impl::~Normalizer2Impl() {
    delete fCanonIterData;
}
428 
429 void
init(const int32_t * inIndexes,const UCPTrie * inTrie,const uint16_t * inExtraData,const uint8_t * inSmallFCD)430 Normalizer2Impl::init(const int32_t *inIndexes, const UCPTrie *inTrie,
431                       const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
432     minDecompNoCP = static_cast<UChar>(inIndexes[IX_MIN_DECOMP_NO_CP]);
433     minCompNoMaybeCP = static_cast<UChar>(inIndexes[IX_MIN_COMP_NO_MAYBE_CP]);
434     minLcccCP = static_cast<UChar>(inIndexes[IX_MIN_LCCC_CP]);
435 
436     minYesNo = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO]);
437     minYesNoMappingsOnly = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]);
438     minNoNo = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO]);
439     minNoNoCompBoundaryBefore = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
440     minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
441     minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]);
442     limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]);
443     minMaybeYes = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]);
444     U_ASSERT((minMaybeYes & 7) == 0);  // 8-aligned for noNoDelta bit fields
445     centerNoNoDelta = (minMaybeYes >> DELTA_SHIFT) - MAX_DELTA - 1;
446 
447     normTrie=inTrie;
448 
449     maybeYesCompositions=inExtraData;
450     extraData=maybeYesCompositions+((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
451 
452     smallFCD=inSmallFCD;
453 }
454 
455 U_CDECL_BEGIN
456 
/**
 * Value-mapping callback passed to ucptrie_getRange() by addCanonIterPropertyStarts().
 * Reduces a trie value to its CANON_NOT_SEGMENT_STARTER bit(s) so that
 * ranges are split only where that flag changes. The context is unused.
 */
static uint32_t U_CALLCONV
segmentStarterMapper(const void * /*context*/, uint32_t value) {
    return value&CANON_NOT_SEGMENT_STARTER;
}
461 
462 U_CDECL_END
463 
464 void
addLcccChars(UnicodeSet & set) const465 Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
466     UChar32 start = 0, end;
467     uint32_t norm16;
468     while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
469                                    nullptr, nullptr, &norm16)) >= 0) {
470         if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&
471                 norm16 != Normalizer2Impl::JAMO_VT) {
472             set.add(start, end);
473         } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
474             uint16_t fcd16 = getFCD16(start);
475             if (fcd16 > 0xff) { set.add(start, end); }
476         }
477         start = end + 1;
478     }
479 }
480 
481 void
addPropertyStarts(const USetAdder * sa,UErrorCode &) const482 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
483     // Add the start code point of each same-value range of the trie.
484     UChar32 start = 0, end;
485     uint32_t value;
486     while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
487                                    nullptr, nullptr, &value)) >= 0) {
488         sa->add(sa->set, start);
489         if (start != end && isAlgorithmicNoNo((uint16_t)value) &&
490                 (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) {
491             // Range of code points with same-norm16-value algorithmic decompositions.
492             // They might have different non-zero FCD16 values.
493             uint16_t prevFCD16 = getFCD16(start);
494             while (++start <= end) {
495                 uint16_t fcd16 = getFCD16(start);
496                 if (fcd16 != prevFCD16) {
497                     sa->add(sa->set, start);
498                     prevFCD16 = fcd16;
499                 }
500             }
501         }
502         start = end + 1;
503     }
504 
505     /* add Hangul LV syllables and LV+1 because of skippables */
506     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
507         sa->add(sa->set, c);
508         sa->add(sa->set, c+1);
509     }
510     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
511 }
512 
513 void
addCanonIterPropertyStarts(const USetAdder * sa,UErrorCode & errorCode) const514 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
515     // Add the start code point of each same-value range of the canonical iterator data trie.
516     if (!ensureCanonIterData(errorCode)) { return; }
517     // Currently only used for the SEGMENT_STARTER property.
518     UChar32 start = 0, end;
519     uint32_t value;
520     while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0,
521                                    segmentStarterMapper, nullptr, &value)) >= 0) {
522         sa->add(sa->set, start);
523         start = end + 1;
524     }
525 }
526 
527 const UChar *
copyLowPrefixFromNulTerminated(const UChar * src,UChar32 minNeedDataCP,ReorderingBuffer * buffer,UErrorCode & errorCode) const528 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
529                                                 UChar32 minNeedDataCP,
530                                                 ReorderingBuffer *buffer,
531                                                 UErrorCode &errorCode) const {
532     // Make some effort to support NUL-terminated strings reasonably.
533     // Take the part of the fast quick check loop that does not look up
534     // data and check the first part of the string.
535     // After this prefix, determine the string length to simplify the rest
536     // of the code.
537     const UChar *prevSrc=src;
538     UChar c;
539     while((c=*src++)<minNeedDataCP && c!=0) {}
540     // Back out the last character for full processing.
541     // Copy this prefix.
542     if(--src!=prevSrc) {
543         if(buffer!=NULL) {
544             buffer->appendZeroCC(prevSrc, src, errorCode);
545         }
546     }
547     return src;
548 }
549 
550 UnicodeString &
decompose(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const551 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
552                            UErrorCode &errorCode) const {
553     if(U_FAILURE(errorCode)) {
554         dest.setToBogus();
555         return dest;
556     }
557     const UChar *sArray=src.getBuffer();
558     if(&dest==&src || sArray==NULL) {
559         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
560         dest.setToBogus();
561         return dest;
562     }
563     decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
564     return dest;
565 }
566 
567 void
decompose(const UChar * src,const UChar * limit,UnicodeString & dest,int32_t destLengthEstimate,UErrorCode & errorCode) const568 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
569                            UnicodeString &dest,
570                            int32_t destLengthEstimate,
571                            UErrorCode &errorCode) const {
572     if(destLengthEstimate<0 && limit!=NULL) {
573         destLengthEstimate=(int32_t)(limit-src);
574     }
575     dest.remove();
576     ReorderingBuffer buffer(*this, dest);
577     if(buffer.init(destLengthEstimate, errorCode)) {
578         decompose(src, limit, &buffer, errorCode);
579     }
580 }
581 
// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/spanQuickCheckYes
//
// src..limit is the UTF-16 input; limit==NULL means that src is NUL-terminated.
// In normalize mode the decomposition is appended to *buffer and the return value
// is the position where processing stopped (limit unless an append failed).
// In quick check mode nothing is written; the return value is limit if the whole
// input passed, otherwise the last boundary seen before the failing code point.
const UChar *
Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
                           ReorderingBuffer *buffer,
                           UErrorCode &errorCode) const {
    UChar32 minNoCP=minDecompNoCP;
    if(limit==NULL) {
        // NUL-terminated input: handle the data-free prefix, then find the real limit.
        src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;

    // only for quick check
    const UChar *prevBoundary=src;
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src; src!=limit;) {
            // Note: c and norm16 are assigned inside the condition and are used
            // after the loop for the code point that stopped it.
            if( (c=*src)<minNoCP ||
                isMostDecompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
            ) {
                ++src;
            } else if(!U16_IS_LEAD(c)) {
                break;
            } else {
                // Lead surrogate: look at the supplementary code point if it is paired.
                UChar c2;
                if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                    c=U16_GET_SUPPLEMENTARY(c, c2);
                    norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
                    if(isMostDecompYesAndZeroCC(norm16)) {
                        src+=2;
                    } else {
                        break;
                    }
                } else {
                    ++src;  // unpaired lead surrogate: inert
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL) {
                if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
                    break;
                }
            } else {
                // Quick check: the skipped text ends with cc==0, so src is a boundary.
                prevCC=0;
                prevBoundary=src;
            }
        }
        if(src==limit) {
            break;
        }

        // Check one above-minimum, relevant code point.
        src+=U16_LENGTH(c);
        if(buffer!=NULL) {
            if(!decompose(c, norm16, *buffer, errorCode)) {
                break;
            }
        } else {
            if(isDecompYes(norm16)) {
                uint8_t cc=getCCFromYesOrMaybe(norm16);
                // In canonical order if cc is 0 or not below the previous cc.
                if(prevCC<=cc || cc==0) {
                    prevCC=cc;
                    if(cc<=1) {
                        prevBoundary=src;
                    }
                    continue;
                }
            }
            return prevBoundary;  // "no" or cc out of order
        }
    }
    return src;
}
667 
668 // Decompose a short piece of text which is likely to contain characters that
669 // fail the quick check loop and/or where the quick check loop's overhead
670 // is unlikely to be amortized.
671 // Called by the compose() and makeFCD() implementations.
672 const UChar *
decomposeShort(const UChar * src,const UChar * limit,UBool stopAtCompBoundary,UBool onlyContiguous,ReorderingBuffer & buffer,UErrorCode & errorCode) const673 Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
674                                 UBool stopAtCompBoundary, UBool onlyContiguous,
675                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
676     if (U_FAILURE(errorCode)) {
677         return nullptr;
678     }
679     while(src<limit) {
680         if (stopAtCompBoundary && *src < minCompNoMaybeCP) {
681             return src;
682         }
683         const UChar *prevSrc = src;
684         UChar32 c;
685         uint16_t norm16;
686         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
687         if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
688             return prevSrc;
689         }
690         if(!decompose(c, norm16, buffer, errorCode)) {
691             return nullptr;
692         }
693         if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
694             return src;
695         }
696     }
697     return src;
698 }
699 
decompose(UChar32 c,uint16_t norm16,ReorderingBuffer & buffer,UErrorCode & errorCode) const700 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
701                                  ReorderingBuffer &buffer,
702                                  UErrorCode &errorCode) const {
703     // get the decomposition and the lead and trail cc's
704     if (norm16 >= limitNoNo) {
705         if (isMaybeOrNonZeroCC(norm16)) {
706             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
707         }
708         // Maps to an isCompYesAndZeroCC.
709         c=mapAlgorithmic(c, norm16);
710         norm16=getRawNorm16(c);
711     }
712     if (norm16 < minYesNo) {
713         // c does not decompose
714         return buffer.append(c, 0, errorCode);
715     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
716         // Hangul syllable: decompose algorithmically
717         UChar jamos[3];
718         return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
719     }
720     // c decomposes, get everything from the variable-length extra data
721     const uint16_t *mapping=getMapping(norm16);
722     uint16_t firstUnit=*mapping;
723     int32_t length=firstUnit&MAPPING_LENGTH_MASK;
724     uint8_t leadCC, trailCC;
725     trailCC=(uint8_t)(firstUnit>>8);
726     if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
727         leadCC=(uint8_t)(*(mapping-1)>>8);
728     } else {
729         leadCC=0;
730     }
731     return buffer.append((const UChar *)mapping+1, length, TRUE, leadCC, trailCC, errorCode);
732 }
733 
// UTF-8 counterpart of the UTF-16 decomposeShort():
// decomposes [src, limit[ code point by code point into the (UTF-16) buffer.
// With stopAtCompBoundary, stops at the first composition boundary and returns
// its position; otherwise returns limit.
// Returns nullptr if errorCode indicates a failure on entry or if appending fails.
const uint8_t *
Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
                                UBool stopAtCompBoundary, UBool onlyContiguous,
                                ReorderingBuffer &buffer, UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    while (src < limit) {
        const uint8_t *prevSrc = src;
        uint16_t norm16;
        // Advances src past one byte sequence and fetches its norm16;
        // the code point itself is not assembled here.
        UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
        // Get the decomposition and the lead and trail cc's.
        // c stays negative until the code point is actually needed.
        UChar32 c = U_SENTINEL;
        if (norm16 >= limitNoNo) {
            if (isMaybeOrNonZeroCC(norm16)) {
                // No boundaries around this character.
                c = codePointFromValidUTF8(prevSrc, src);
                if (!buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode)) {
                    return nullptr;
                }
                continue;
            }
            // Maps to an isCompYesAndZeroCC.
            if (stopAtCompBoundary) {
                return prevSrc;
            }
            c = codePointFromValidUTF8(prevSrc, src);
            c = mapAlgorithmic(c, norm16);
            // From here on, norm16 describes the mapped code point.
            norm16 = getRawNorm16(c);
        } else if (stopAtCompBoundary && norm16 < minNoNoCompNoMaybeCC) {
            return prevSrc;
        }
        // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
        // We do not see invalid UTF-8 here because
        // its norm16==INERT is normalization-inert,
        // so it gets copied unchanged in the fast path,
        // and we stop the slow path where invalid UTF-8 begins.
        U_ASSERT(norm16 != INERT);
        if (norm16 < minYesNo) {
            if (c < 0) {
                c = codePointFromValidUTF8(prevSrc, src);
            }
            // does not decompose
            if (!buffer.append(c, 0, errorCode)) {
                return nullptr;
            }
        } else if (isHangulLV(norm16) || isHangulLVT(norm16)) {
            // Hangul syllable: decompose algorithmically
            if (c < 0) {
                c = codePointFromValidUTF8(prevSrc, src);
            }
            char16_t jamos[3];
            if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) {
                return nullptr;
            }
        } else {
            // The character decomposes, get everything from the variable-length extra data.
            const uint16_t *mapping = getMapping(norm16);
            // The first unit holds the mapping length and, in its upper byte, the trail cc.
            uint16_t firstUnit = *mapping;
            int32_t length = firstUnit & MAPPING_LENGTH_MASK;
            uint8_t trailCC = (uint8_t)(firstUnit >> 8);
            uint8_t leadCC;
            if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {
                // The optional unit before the mapping holds the lead cc in its upper byte.
                leadCC = (uint8_t)(*(mapping-1) >> 8);
            } else {
                leadCC = 0;
            }
            if (!buffer.append((const char16_t *)mapping+1, length, TRUE, leadCC, trailCC, errorCode)) {
                return nullptr;
            }
        }
        if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
            return src;
        }
    }
    return src;
}
811 
812 const UChar *
getDecomposition(UChar32 c,UChar buffer[4],int32_t & length) const813 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
814     uint16_t norm16;
815     if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
816         // c does not decompose
817         return nullptr;
818     }
819     const UChar *decomp = nullptr;
820     if(isDecompNoAlgorithmic(norm16)) {
821         // Maps to an isCompYesAndZeroCC.
822         c=mapAlgorithmic(c, norm16);
823         decomp=buffer;
824         length=0;
825         U16_APPEND_UNSAFE(buffer, length, c);
826         // The mapping might decompose further.
827         norm16 = getRawNorm16(c);
828     }
829     if (norm16 < minYesNo) {
830         return decomp;
831     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
832         // Hangul syllable: decompose algorithmically
833         length=Hangul::decompose(c, buffer);
834         return buffer;
835     }
836     // c decomposes, get everything from the variable-length extra data
837     const uint16_t *mapping=getMapping(norm16);
838     length=*mapping&MAPPING_LENGTH_MASK;
839     return (const UChar *)mapping+1;
840 }
841 
842 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
843 // so that a raw mapping fits that consists of one unit ("rm0")
844 // plus all but the first two code units of the normal mapping.
845 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
846 const UChar *
getRawDecomposition(UChar32 c,UChar buffer[30],int32_t & length) const847 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
848     uint16_t norm16;
849     if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
850         // c does not decompose
851         return NULL;
852     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
853         // Hangul syllable: decompose algorithmically
854         Hangul::getRawDecomposition(c, buffer);
855         length=2;
856         return buffer;
857     } else if(isDecompNoAlgorithmic(norm16)) {
858         c=mapAlgorithmic(c, norm16);
859         length=0;
860         U16_APPEND_UNSAFE(buffer, length, c);
861         return buffer;
862     }
863     // c decomposes, get everything from the variable-length extra data
864     const uint16_t *mapping=getMapping(norm16);
865     uint16_t firstUnit=*mapping;
866     int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
867     if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
868         // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
869         // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
870         const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
871         uint16_t rm0=*rawMapping;
872         if(rm0<=MAPPING_LENGTH_MASK) {
873             length=rm0;
874             return (const UChar *)rawMapping-rm0;
875         } else {
876             // Copy the normal mapping and replace its first two code units with rm0.
877             buffer[0]=(UChar)rm0;
878             u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
879             length=mLength-1;
880             return buffer;
881         }
882     } else {
883         length=mLength;
884         return (const UChar *)mapping+1;
885     }
886 }
887 
decomposeAndAppend(const UChar * src,const UChar * limit,UBool doDecompose,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const888 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
889                                          UBool doDecompose,
890                                          UnicodeString &safeMiddle,
891                                          ReorderingBuffer &buffer,
892                                          UErrorCode &errorCode) const {
893     buffer.copyReorderableSuffixTo(safeMiddle);
894     if(doDecompose) {
895         decompose(src, limit, &buffer, errorCode);
896         return;
897     }
898     // Just merge the strings at the boundary.
899     bool isFirst = true;
900     uint8_t firstCC = 0, prevCC = 0, cc;
901     const UChar *p = src;
902     while (p != limit) {
903         const UChar *codePointStart = p;
904         UChar32 c;
905         uint16_t norm16;
906         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
907         if ((cc = getCC(norm16)) == 0) {
908             p = codePointStart;
909             break;
910         }
911         if (isFirst) {
912             firstCC = cc;
913             isFirst = false;
914         }
915         prevCC = cc;
916     }
917     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
918         limit=u_strchr(p, 0);
919     }
920 
921     if (buffer.append(src, (int32_t)(p - src), FALSE, firstCC, prevCC, errorCode)) {
922         buffer.appendZeroCC(p, limit, errorCode);
923     }
924 }
925 
hasDecompBoundaryBefore(UChar32 c) const926 UBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const {
927     return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
928         norm16HasDecompBoundaryBefore(getNorm16(c));
929 }
930 
norm16HasDecompBoundaryBefore(uint16_t norm16) const931 UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const {
932     if (norm16 < minNoNoCompNoMaybeCC) {
933         return TRUE;
934     }
935     if (norm16 >= limitNoNo) {
936         return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
937     }
938     // c decomposes, get everything from the variable-length extra data
939     const uint16_t *mapping=getMapping(norm16);
940     uint16_t firstUnit=*mapping;
941     // TRUE if leadCC==0 (hasFCDBoundaryBefore())
942     return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
943 }
944 
hasDecompBoundaryAfter(UChar32 c) const945 UBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const {
946     if (c < minDecompNoCP) {
947         return TRUE;
948     }
949     if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
950         return TRUE;
951     }
952     return norm16HasDecompBoundaryAfter(getNorm16(c));
953 }
954 
norm16HasDecompBoundaryAfter(uint16_t norm16) const955 UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const {
956     if(norm16 <= minYesNo || isHangulLVT(norm16)) {
957         return TRUE;
958     }
959     if (norm16 >= limitNoNo) {
960         if (isMaybeOrNonZeroCC(norm16)) {
961             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
962         }
963         // Maps to an isCompYesAndZeroCC.
964         return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
965     }
966     // c decomposes, get everything from the variable-length extra data
967     const uint16_t *mapping=getMapping(norm16);
968     uint16_t firstUnit=*mapping;
969     // decomp after-boundary: same as hasFCDBoundaryAfter(),
970     // fcd16<=1 || trailCC==0
971     if(firstUnit>0x1ff) {
972         return FALSE;  // trailCC>1
973     }
974     if(firstUnit<=0xff) {
975         return TRUE;  // trailCC==0
976     }
977     // if(trailCC==1) test leadCC==0, same as checking for before-boundary
978     // TRUE if leadCC==0 (hasFCDBoundaryBefore())
979     return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
980 }
981 
982 /*
983  * Finds the recomposition result for
984  * a forward-combining "lead" character,
985  * specified with a pointer to its compositions list,
986  * and a backward-combining "trail" character.
987  *
988  * If the lead and trail characters combine, then this function returns
989  * the following "compositeAndFwd" value:
990  * Bits 21..1  composite character
991  * Bit      0  set if the composite is a forward-combining starter
992  * otherwise it returns -1.
993  *
994  * The compositions list has (trail, compositeAndFwd) pair entries,
995  * encoded as either pairs or triples of 16-bit units.
996  * The last entry has the high bit of its first unit set.
997  *
998  * The list is sorted by ascending trail characters (there are no duplicates).
999  * A linear search is used.
1000  *
1001  * See normalizer2impl.h for a more detailed description
1002  * of the compositions list format.
1003  */
combine(const uint16_t * list,UChar32 trail)1004 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
1005     uint16_t key1, firstUnit;
1006     if(trail<COMP_1_TRAIL_LIMIT) {
1007         // trail character is 0..33FF
1008         // result entry may have 2 or 3 units
1009         key1=(uint16_t)(trail<<1);
1010         while(key1>(firstUnit=*list)) {
1011             list+=2+(firstUnit&COMP_1_TRIPLE);
1012         }
1013         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1014             if(firstUnit&COMP_1_TRIPLE) {
1015                 return ((int32_t)list[1]<<16)|list[2];
1016             } else {
1017                 return list[1];
1018             }
1019         }
1020     } else {
1021         // trail character is 3400..10FFFF
1022         // result entry has 3 units
1023         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
1024                         (((trail>>COMP_1_TRAIL_SHIFT))&
1025                           ~COMP_1_TRIPLE));
1026         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
1027         uint16_t secondUnit;
1028         for(;;) {
1029             if(key1>(firstUnit=*list)) {
1030                 list+=2+(firstUnit&COMP_1_TRIPLE);
1031             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1032                 if(key2>(secondUnit=list[1])) {
1033                     if(firstUnit&COMP_1_LAST_TUPLE) {
1034                         break;
1035                     } else {
1036                         list+=3;
1037                     }
1038                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1039                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
1040                 } else {
1041                     break;
1042                 }
1043             } else {
1044                 break;
1045             }
1046         }
1047     }
1048     return -1;
1049 }
1050 
1051 /**
1052   * @param list some character's compositions list
1053   * @param set recursively receives the composites from these compositions
1054   */
addComposites(const uint16_t * list,UnicodeSet & set) const1055 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
1056     uint16_t firstUnit;
1057     int32_t compositeAndFwd;
1058     do {
1059         firstUnit=*list;
1060         if((firstUnit&COMP_1_TRIPLE)==0) {
1061             compositeAndFwd=list[1];
1062             list+=2;
1063         } else {
1064             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
1065             list+=3;
1066         }
1067         UChar32 composite=compositeAndFwd>>1;
1068         if((compositeAndFwd&1)!=0) {
1069             addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);
1070         }
1071         set.add(composite);
1072     } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1073 }
1074 
1075 /*
1076  * Recomposes the buffer text starting at recomposeStartIndex
1077  * (which is in NFD - decomposed and canonically ordered),
1078  * and truncates the buffer contents.
1079  *
1080  * Note that recomposition never lengthens the text:
1081  * Any character consists of either one or two code units;
1082  * a composition may contain at most one more code unit than the original starter,
1083  * while the combining mark that is removed has at least one code unit.
1084  */
recompose(ReorderingBuffer & buffer,int32_t recomposeStartIndex,UBool onlyContiguous) const1085 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
1086                                 UBool onlyContiguous) const {
1087     UChar *p=buffer.getStart()+recomposeStartIndex;
1088     UChar *limit=buffer.getLimit();
1089     if(p==limit) {
1090         return;
1091     }
1092 
1093     UChar *starter, *pRemove, *q, *r;
1094     const uint16_t *compositionsList;
1095     UChar32 c, compositeAndFwd;
1096     uint16_t norm16;
1097     uint8_t cc, prevCC;
1098     UBool starterIsSupplementary;
1099 
1100     // Some of the following variables are not used until we have a forward-combining starter
1101     // and are only initialized now to avoid compiler warnings.
1102     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
1103     starter=NULL;
1104     starterIsSupplementary=FALSE;
1105     prevCC=0;
1106 
1107     for(;;) {
1108         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
1109         cc=getCCFromYesOrMaybe(norm16);
1110         if( // this character combines backward and
1111             isMaybe(norm16) &&
1112             // we have seen a starter that combines forward and
1113             compositionsList!=NULL &&
1114             // the backward-combining character is not blocked
1115             (prevCC<cc || prevCC==0)
1116         ) {
1117             if(isJamoVT(norm16)) {
1118                 // c is a Jamo V/T, see if we can compose it with the previous character.
1119                 if(c<Hangul::JAMO_T_BASE) {
1120                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1121                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
1122                     if(prev<Hangul::JAMO_L_COUNT) {
1123                         pRemove=p-1;
1124                         UChar syllable=(UChar)
1125                             (Hangul::HANGUL_BASE+
1126                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1127                              Hangul::JAMO_T_COUNT);
1128                         UChar t;
1129                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1130                             ++p;
1131                             syllable+=t;  // The next character was a Jamo T.
1132                         }
1133                         *starter=syllable;
1134                         // remove the Jamo V/T
1135                         q=pRemove;
1136                         r=p;
1137                         while(r<limit) {
1138                             *q++=*r++;
1139                         }
1140                         limit=q;
1141                         p=pRemove;
1142                     }
1143                 }
1144                 /*
1145                  * No "else" for Jamo T:
1146                  * Since the input is in NFD, there are no Hangul LV syllables that
1147                  * a Jamo T could combine with.
1148                  * All Jamo Ts are combined above when handling Jamo Vs.
1149                  */
1150                 if(p==limit) {
1151                     break;
1152                 }
1153                 compositionsList=NULL;
1154                 continue;
1155             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
1156                 // The starter and the combining mark (c) do combine.
1157                 UChar32 composite=compositeAndFwd>>1;
1158 
1159                 // Replace the starter with the composite, remove the combining mark.
1160                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
1161                 if(starterIsSupplementary) {
1162                     if(U_IS_SUPPLEMENTARY(composite)) {
1163                         // both are supplementary
1164                         starter[0]=U16_LEAD(composite);
1165                         starter[1]=U16_TRAIL(composite);
1166                     } else {
1167                         *starter=(UChar)composite;
1168                         // The composite is shorter than the starter,
1169                         // move the intermediate characters forward one.
1170                         starterIsSupplementary=FALSE;
1171                         q=starter+1;
1172                         r=q+1;
1173                         while(r<pRemove) {
1174                             *q++=*r++;
1175                         }
1176                         --pRemove;
1177                     }
1178                 } else if(U_IS_SUPPLEMENTARY(composite)) {
1179                     // The composite is longer than the starter,
1180                     // move the intermediate characters back one.
1181                     starterIsSupplementary=TRUE;
1182                     ++starter;  // temporarily increment for the loop boundary
1183                     q=pRemove;
1184                     r=++pRemove;
1185                     while(starter<q) {
1186                         *--r=*--q;
1187                     }
1188                     *starter=U16_TRAIL(composite);
1189                     *--starter=U16_LEAD(composite);  // undo the temporary increment
1190                 } else {
1191                     // both are on the BMP
1192                     *starter=(UChar)composite;
1193                 }
1194 
1195                 /* remove the combining mark by moving the following text over it */
1196                 if(pRemove<p) {
1197                     q=pRemove;
1198                     r=p;
1199                     while(r<limit) {
1200                         *q++=*r++;
1201                     }
1202                     limit=q;
1203                     p=pRemove;
1204                 }
1205                 // Keep prevCC because we removed the combining mark.
1206 
1207                 if(p==limit) {
1208                     break;
1209                 }
1210                 // Is the composite a starter that combines forward?
1211                 if(compositeAndFwd&1) {
1212                     compositionsList=
1213                         getCompositionsListForComposite(getRawNorm16(composite));
1214                 } else {
1215                     compositionsList=NULL;
1216                 }
1217 
1218                 // We combined; continue with looking for compositions.
1219                 continue;
1220             }
1221         }
1222 
1223         // no combination this time
1224         prevCC=cc;
1225         if(p==limit) {
1226             break;
1227         }
1228 
1229         // If c did not combine, then check if it is a starter.
1230         if(cc==0) {
1231             // Found a new starter.
1232             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
1233                 // It may combine with something, prepare for it.
1234                 if(U_IS_BMP(c)) {
1235                     starterIsSupplementary=FALSE;
1236                     starter=p-1;
1237                 } else {
1238                     starterIsSupplementary=TRUE;
1239                     starter=p-2;
1240                 }
1241             }
1242         } else if(onlyContiguous) {
1243             // FCC: no discontiguous compositions; any intervening character blocks.
1244             compositionsList=NULL;
1245         }
1246     }
1247     buffer.setReorderingLimit(limit);
1248 }
1249 
1250 UChar32
composePair(UChar32 a,UChar32 b) const1251 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
1252     uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16
1253     const uint16_t *list;
1254     if(isInert(norm16)) {
1255         return U_SENTINEL;
1256     } else if(norm16<minYesNoMappingsOnly) {
1257         // a combines forward.
1258         if(isJamoL(norm16)) {
1259             b-=Hangul::JAMO_V_BASE;
1260             if(0<=b && b<Hangul::JAMO_V_COUNT) {
1261                 return
1262                     (Hangul::HANGUL_BASE+
1263                      ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
1264                      Hangul::JAMO_T_COUNT);
1265             } else {
1266                 return U_SENTINEL;
1267             }
1268         } else if(isHangulLV(norm16)) {
1269             b-=Hangul::JAMO_T_BASE;
1270             if(0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
1271                 return a+b;
1272             } else {
1273                 return U_SENTINEL;
1274             }
1275         } else {
1276             // 'a' has a compositions list in extraData
1277             list=getMapping(norm16);
1278             if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
1279                 list+=  // mapping pointer
1280                     1+  // +1 to skip the first unit with the mapping length
1281                     (*list&MAPPING_LENGTH_MASK);  // + mapping length
1282             }
1283         }
1284     } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1285         return U_SENTINEL;
1286     } else {
1287         list=getCompositionsListForMaybe(norm16);
1288     }
1289     if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
1290         return U_SENTINEL;
1291     }
1292 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1293     return combine(list, b)>>1;
1294 #else
1295     int32_t compositeAndFwd=combine(list, b);
1296     return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1297 #endif
1298 }
1299 
1300 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1301 // doCompose: normalize
1302 // !doCompose: isNormalized (buffer must be empty and initialized)
1303 UBool
compose(const UChar * src,const UChar * limit,UBool onlyContiguous,UBool doCompose,ReorderingBuffer & buffer,UErrorCode & errorCode) const1304 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
1305                          UBool onlyContiguous,
1306                          UBool doCompose,
1307                          ReorderingBuffer &buffer,
1308                          UErrorCode &errorCode) const {
1309     const UChar *prevBoundary=src;
1310     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1311     if(limit==NULL) {
1312         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
1313                                            doCompose ? &buffer : NULL,
1314                                            errorCode);
1315         if(U_FAILURE(errorCode)) {
1316             return FALSE;
1317         }
1318         limit=u_strchr(src, 0);
1319         if (prevBoundary != src) {
1320             if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
1321                 prevBoundary = src;
1322             } else {
1323                 buffer.removeSuffix(1);
1324                 prevBoundary = --src;
1325             }
1326         }
1327     }
1328 
1329     for (;;) {
1330         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1331         // or with (compYes && ccc==0) properties.
1332         const UChar *prevSrc;
1333         UChar32 c = 0;
1334         uint16_t norm16 = 0;
1335         for (;;) {
1336             if (src == limit) {
1337                 if (prevBoundary != limit && doCompose) {
1338                     buffer.appendZeroCC(prevBoundary, limit, errorCode);
1339                 }
1340                 return TRUE;
1341             }
1342             if( (c=*src)<minNoMaybeCP ||
1343                 isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
1344             ) {
1345                 ++src;
1346             } else {
1347                 prevSrc = src++;
1348                 if(!U16_IS_LEAD(c)) {
1349                     break;
1350                 } else {
1351                     UChar c2;
1352                     if(src!=limit && U16_IS_TRAIL(c2=*src)) {
1353                         ++src;
1354                         c=U16_GET_SUPPLEMENTARY(c, c2);
1355                         norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
1356                         if(!isCompYesAndZeroCC(norm16)) {
1357                             break;
1358                         }
1359                     }
1360                 }
1361             }
1362         }
1363         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1364         // The current character is either a "noNo" (has a mapping)
1365         // or a "maybeYes" (combines backward)
1366         // or a "yesYes" with ccc!=0.
1367         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1368 
1369         // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1370         if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
1371             if (!doCompose) {
1372                 return FALSE;
1373             }
1374             // Fast path for mapping a character that is immediately surrounded by boundaries.
1375             // In this case, we need not decompose around the current character.
1376             if (isDecompNoAlgorithmic(norm16)) {
1377                 // Maps to a single isCompYesAndZeroCC character
1378                 // which also implies hasCompBoundaryBefore.
1379                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1380                         hasCompBoundaryBefore(src, limit)) {
1381                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1382                         break;
1383                     }
1384                     if(!buffer.append(mapAlgorithmic(c, norm16), 0, errorCode)) {
1385                         break;
1386                     }
1387                     prevBoundary = src;
1388                     continue;
1389                 }
1390             } else if (norm16 < minNoNoCompBoundaryBefore) {
1391                 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1392                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1393                         hasCompBoundaryBefore(src, limit)) {
1394                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1395                         break;
1396                     }
1397                     const UChar *mapping = reinterpret_cast<const UChar *>(getMapping(norm16));
1398                     int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1399                     if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) {
1400                         break;
1401                     }
1402                     prevBoundary = src;
1403                     continue;
1404                 }
1405             } else if (norm16 >= minNoNoEmpty) {
1406                 // The current character maps to nothing.
1407                 // Simply omit it from the output if there is a boundary before _or_ after it.
1408                 // The character itself implies no boundaries.
1409                 if (hasCompBoundaryBefore(src, limit) ||
1410                         hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1411                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1412                         break;
1413                     }
1414                     prevBoundary = src;
1415                     continue;
1416                 }
1417             }
1418             // Other "noNo" type, or need to examine more text around this character:
1419             // Fall through to the slow path.
1420         } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
1421             UChar prev=*(prevSrc-1);
1422             if(c<Hangul::JAMO_T_BASE) {
1423                 // The current character is a Jamo Vowel,
1424                 // compose with previous Jamo L and following Jamo T.
1425                 UChar l = (UChar)(prev-Hangul::JAMO_L_BASE);
1426                 if(l<Hangul::JAMO_L_COUNT) {
1427                     if (!doCompose) {
1428                         return FALSE;
1429                     }
1430                     int32_t t;
1431                     if (src != limit &&
1432                             0 < (t = ((int32_t)*src - Hangul::JAMO_T_BASE)) &&
1433                             t < Hangul::JAMO_T_COUNT) {
1434                         // The next character is a Jamo T.
1435                         ++src;
1436                     } else if (hasCompBoundaryBefore(src, limit)) {
1437                         // No Jamo T follows, not even via decomposition.
1438                         t = 0;
1439                     } else {
1440                         t = -1;
1441                     }
1442                     if (t >= 0) {
1443                         UChar32 syllable = Hangul::HANGUL_BASE +
1444                             (l*Hangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE)) *
1445                             Hangul::JAMO_T_COUNT + t;
1446                         --prevSrc;  // Replace the Jamo L as well.
1447                         if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1448                             break;
1449                         }
1450                         if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) {
1451                             break;
1452                         }
1453                         prevBoundary = src;
1454                         continue;
1455                     }
1456                     // If we see L+V+x where x!=T then we drop to the slow path,
1457                     // decompose and recompose.
1458                     // This is to deal with NFKC finding normal L and V but a
1459                     // compatibility variant of a T.
1460                     // We need to either fully compose that combination here
1461                     // (which would complicate the code and may not work with strange custom data)
1462                     // or use the slow path.
1463                 }
1464             } else if (Hangul::isHangulLV(prev)) {
1465                 // The current character is a Jamo Trailing consonant,
1466                 // compose with previous Hangul LV that does not contain a Jamo T.
1467                 if (!doCompose) {
1468                     return FALSE;
1469                 }
1470                 UChar32 syllable = prev + c - Hangul::JAMO_T_BASE;
1471                 --prevSrc;  // Replace the Hangul LV as well.
1472                 if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1473                     break;
1474                 }
1475                 if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) {
1476                     break;
1477                 }
1478                 prevBoundary = src;
1479                 continue;
1480             }
1481             // No matching context, or may need to decompose surrounding text first:
1482             // Fall through to the slow path.
1483         } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
1484             // One or more combining marks that do not combine-back:
1485             // Check for canonical order, copy unchanged if ok and
1486             // if followed by a character with a boundary-before.
1487             uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
1488             if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
1489                 // Fails FCD test, need to decompose and contiguously recompose.
1490                 if (!doCompose) {
1491                     return FALSE;
1492                 }
1493             } else {
1494                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1495                 // the previous character which passed the quick check "yes && ccc==0" test.
1496                 const UChar *nextSrc;
1497                 uint16_t n16;
1498                 for (;;) {
1499                     if (src == limit) {
1500                         if (doCompose) {
1501                             buffer.appendZeroCC(prevBoundary, limit, errorCode);
1502                         }
1503                         return TRUE;
1504                     }
1505                     uint8_t prevCC = cc;
1506                     nextSrc = src;
1507                     UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, n16);
1508                     if (n16 >= MIN_YES_YES_WITH_CC) {
1509                         cc = getCCFromNormalYesOrMaybe(n16);
1510                         if (prevCC > cc) {
1511                             if (!doCompose) {
1512                                 return FALSE;
1513                             }
1514                             break;
1515                         }
1516                     } else {
1517                         break;
1518                     }
1519                     src = nextSrc;
1520                 }
1521                 // src is after the last in-order combining mark.
1522                 // If there is a boundary here, then we continue with no change.
1523                 if (norm16HasCompBoundaryBefore(n16)) {
1524                     if (isCompYesAndZeroCC(n16)) {
1525                         src = nextSrc;
1526                     }
1527                     continue;
1528                 }
1529                 // Use the slow path. There is no boundary in [prevSrc, src[.
1530             }
1531         }
1532 
1533         // Slow path: Find the nearest boundaries around the current character,
1534         // decompose and recompose.
1535         if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
1536             const UChar *p = prevSrc;
1537             UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, norm16);
1538             if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1539                 prevSrc = p;
1540             }
1541         }
1542         if (doCompose && prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1543             break;
1544         }
1545         int32_t recomposeStartIndex=buffer.length();
1546         // We know there is not a boundary here.
1547         decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
1548                        buffer, errorCode);
1549         // Decompose until the next boundary.
1550         src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
1551                              buffer, errorCode);
1552         if (U_FAILURE(errorCode)) {
1553             break;
1554         }
1555         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
1556             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1557             return TRUE;
1558         }
1559         recompose(buffer, recomposeStartIndex, onlyContiguous);
1560         if(!doCompose) {
1561             if(!buffer.equals(prevSrc, src)) {
1562                 return FALSE;
1563             }
1564             buffer.remove();
1565         }
1566         prevBoundary=src;
1567     }
1568     return TRUE;
1569 }
1570 
1571 // Very similar to compose(): Make the same changes in both places if relevant.
1572 // pQCResult==NULL: spanQuickCheckYes
1573 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1574 const UChar *
composeQuickCheck(const UChar * src,const UChar * limit,UBool onlyContiguous,UNormalizationCheckResult * pQCResult) const1575 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1576                                    UBool onlyContiguous,
1577                                    UNormalizationCheckResult *pQCResult) const {
1578     const UChar *prevBoundary=src;
1579     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1580     if(limit==NULL) {
1581         UErrorCode errorCode=U_ZERO_ERROR;
1582         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1583         limit=u_strchr(src, 0);
1584         if (prevBoundary != src) {
1585             if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
1586                 prevBoundary = src;
1587             } else {
1588                 prevBoundary = --src;
1589             }
1590         }
1591     }
1592 
1593     for(;;) {
1594         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1595         // or with (compYes && ccc==0) properties.
1596         const UChar *prevSrc;
1597         UChar32 c = 0;
1598         uint16_t norm16 = 0;
1599         for (;;) {
1600             if(src==limit) {
1601                 return src;
1602             }
1603             if( (c=*src)<minNoMaybeCP ||
1604                 isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
1605             ) {
1606                 ++src;
1607             } else {
1608                 prevSrc = src++;
1609                 if(!U16_IS_LEAD(c)) {
1610                     break;
1611                 } else {
1612                     UChar c2;
1613                     if(src!=limit && U16_IS_TRAIL(c2=*src)) {
1614                         ++src;
1615                         c=U16_GET_SUPPLEMENTARY(c, c2);
1616                         norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
1617                         if(!isCompYesAndZeroCC(norm16)) {
1618                             break;
1619                         }
1620                     }
1621                 }
1622             }
1623         }
1624         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1625         // The current character is either a "noNo" (has a mapping)
1626         // or a "maybeYes" (combines backward)
1627         // or a "yesYes" with ccc!=0.
1628         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1629 
1630         uint16_t prevNorm16 = INERT;
1631         if (prevBoundary != prevSrc) {
1632             if (norm16HasCompBoundaryBefore(norm16)) {
1633                 prevBoundary = prevSrc;
1634             } else {
1635                 const UChar *p = prevSrc;
1636                 uint16_t n16;
1637                 UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, n16);
1638                 if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1639                     prevBoundary = prevSrc;
1640                 } else {
1641                     prevBoundary = p;
1642                     prevNorm16 = n16;
1643                 }
1644             }
1645         }
1646 
1647         if(isMaybeOrNonZeroCC(norm16)) {
1648             uint8_t cc=getCCFromYesOrMaybe(norm16);
1649             if (onlyContiguous /* FCC */ && cc != 0 &&
1650                     getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1651                 // The [prevBoundary..prevSrc[ character
1652                 // passed the quick check "yes && ccc==0" test
1653                 // but is out of canonical order with the current combining mark.
1654             } else {
1655                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1656                 // the previous character which passed the quick check "yes && ccc==0" test.
1657                 const UChar *nextSrc;
1658                 for (;;) {
1659                     if (norm16 < MIN_YES_YES_WITH_CC) {
1660                         if (pQCResult != nullptr) {
1661                             *pQCResult = UNORM_MAYBE;
1662                         } else {
1663                             return prevBoundary;
1664                         }
1665                     }
1666                     if (src == limit) {
1667                         return src;
1668                     }
1669                     uint8_t prevCC = cc;
1670                     nextSrc = src;
1671                     UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, norm16);
1672                     if (isMaybeOrNonZeroCC(norm16)) {
1673                         cc = getCCFromYesOrMaybe(norm16);
1674                         if (!(prevCC <= cc || cc == 0)) {
1675                             break;
1676                         }
1677                     } else {
1678                         break;
1679                     }
1680                     src = nextSrc;
1681                 }
1682                 // src is after the last in-order combining mark.
1683                 if (isCompYesAndZeroCC(norm16)) {
1684                     prevBoundary = src;
1685                     src = nextSrc;
1686                     continue;
1687                 }
1688             }
1689         }
1690         if(pQCResult!=NULL) {
1691             *pQCResult=UNORM_NO;
1692         }
1693         return prevBoundary;
1694     }
1695 }
1696 
composeAndAppend(const UChar * src,const UChar * limit,UBool doCompose,UBool onlyContiguous,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1697 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1698                                        UBool doCompose,
1699                                        UBool onlyContiguous,
1700                                        UnicodeString &safeMiddle,
1701                                        ReorderingBuffer &buffer,
1702                                        UErrorCode &errorCode) const {
1703     if(!buffer.isEmpty()) {
1704         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit, onlyContiguous);
1705         if(src!=firstStarterInSrc) {
1706             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1707                                                                     buffer.getLimit(), onlyContiguous);
1708             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1709             UnicodeString middle(lastStarterInDest, destSuffixLength);
1710             buffer.removeSuffix(destSuffixLength);
1711             safeMiddle=middle;
1712             middle.append(src, (int32_t)(firstStarterInSrc-src));
1713             const UChar *middleStart=middle.getBuffer();
1714             compose(middleStart, middleStart+middle.length(), onlyContiguous,
1715                     TRUE, buffer, errorCode);
1716             if(U_FAILURE(errorCode)) {
1717                 return;
1718             }
1719             src=firstStarterInSrc;
1720         }
1721     }
1722     if(doCompose) {
1723         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1724     } else {
1725         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1726             limit=u_strchr(src, 0);
1727         }
1728         buffer.appendZeroCC(src, limit, errorCode);
1729     }
1730 }
1731 
1732 UBool
composeUTF8(uint32_t options,UBool onlyContiguous,const uint8_t * src,const uint8_t * limit,ByteSink * sink,Edits * edits,UErrorCode & errorCode) const1733 Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
1734                              const uint8_t *src, const uint8_t *limit,
1735                              ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
1736     U_ASSERT(limit != nullptr);
1737     UnicodeString s16;
1738     uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP);
1739     const uint8_t *prevBoundary = src;
1740 
1741     for (;;) {
1742         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1743         // or with (compYes && ccc==0) properties.
1744         const uint8_t *prevSrc;
1745         uint16_t norm16 = 0;
1746         for (;;) {
1747             if (src == limit) {
1748                 if (prevBoundary != limit && sink != nullptr) {
1749                     ByteSinkUtil::appendUnchanged(prevBoundary, limit,
1750                                                   *sink, options, edits, errorCode);
1751                 }
1752                 return TRUE;
1753             }
1754             if (*src < minNoMaybeLead) {
1755                 ++src;
1756             } else {
1757                 prevSrc = src;
1758                 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
1759                 if (!isCompYesAndZeroCC(norm16)) {
1760                     break;
1761                 }
1762             }
1763         }
1764         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1765         // The current character is either a "noNo" (has a mapping)
1766         // or a "maybeYes" (combines backward)
1767         // or a "yesYes" with ccc!=0.
1768         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1769 
1770         // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1771         if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
1772             if (sink == nullptr) {
1773                 return FALSE;
1774             }
1775             // Fast path for mapping a character that is immediately surrounded by boundaries.
1776             // In this case, we need not decompose around the current character.
1777             if (isDecompNoAlgorithmic(norm16)) {
1778                 // Maps to a single isCompYesAndZeroCC character
1779                 // which also implies hasCompBoundaryBefore.
1780                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1781                         hasCompBoundaryBefore(src, limit)) {
1782                     if (prevBoundary != prevSrc &&
1783                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1784                                                            *sink, options, edits, errorCode)) {
1785                         break;
1786                     }
1787                     appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);
1788                     prevBoundary = src;
1789                     continue;
1790                 }
1791             } else if (norm16 < minNoNoCompBoundaryBefore) {
1792                 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1793                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1794                         hasCompBoundaryBefore(src, limit)) {
1795                     if (prevBoundary != prevSrc &&
1796                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1797                                                            *sink, options, edits, errorCode)) {
1798                         break;
1799                     }
1800                     const uint16_t *mapping = getMapping(norm16);
1801                     int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1802                     if (!ByteSinkUtil::appendChange(prevSrc, src, (const UChar *)mapping, length,
1803                                                     *sink, edits, errorCode)) {
1804                         break;
1805                     }
1806                     prevBoundary = src;
1807                     continue;
1808                 }
1809             } else if (norm16 >= minNoNoEmpty) {
1810                 // The current character maps to nothing.
1811                 // Simply omit it from the output if there is a boundary before _or_ after it.
1812                 // The character itself implies no boundaries.
1813                 if (hasCompBoundaryBefore(src, limit) ||
1814                         hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1815                     if (prevBoundary != prevSrc &&
1816                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1817                                                            *sink, options, edits, errorCode)) {
1818                         break;
1819                     }
1820                     if (edits != nullptr) {
1821                         edits->addReplace((int32_t)(src - prevSrc), 0);
1822                     }
1823                     prevBoundary = src;
1824                     continue;
1825                 }
1826             }
1827             // Other "noNo" type, or need to examine more text around this character:
1828             // Fall through to the slow path.
1829         } else if (isJamoVT(norm16)) {
1830             // Jamo L: E1 84 80..92
1831             // Jamo V: E1 85 A1..B5
1832             // Jamo T: E1 86 A8..E1 87 82
1833             U_ASSERT((src - prevSrc) == 3 && *prevSrc == 0xe1);
1834             UChar32 prev = previousHangulOrJamo(prevBoundary, prevSrc);
1835             if (prevSrc[1] == 0x85) {
1836                 // The current character is a Jamo Vowel,
1837                 // compose with previous Jamo L and following Jamo T.
1838                 UChar32 l = prev - Hangul::JAMO_L_BASE;
1839                 if ((uint32_t)l < Hangul::JAMO_L_COUNT) {
1840                     if (sink == nullptr) {
1841                         return FALSE;
1842                     }
1843                     int32_t t = getJamoTMinusBase(src, limit);
1844                     if (t >= 0) {
1845                         // The next character is a Jamo T.
1846                         src += 3;
1847                     } else if (hasCompBoundaryBefore(src, limit)) {
1848                         // No Jamo T follows, not even via decomposition.
1849                         t = 0;
1850                     }
1851                     if (t >= 0) {
1852                         UChar32 syllable = Hangul::HANGUL_BASE +
1853                             (l*Hangul::JAMO_V_COUNT + (prevSrc[2]-0xa1)) *
1854                             Hangul::JAMO_T_COUNT + t;
1855                         prevSrc -= 3;  // Replace the Jamo L as well.
1856                         if (prevBoundary != prevSrc &&
1857                                 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1858                                                                *sink, options, edits, errorCode)) {
1859                             break;
1860                         }
1861                         ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
1862                         prevBoundary = src;
1863                         continue;
1864                     }
1865                     // If we see L+V+x where x!=T then we drop to the slow path,
1866                     // decompose and recompose.
1867                     // This is to deal with NFKC finding normal L and V but a
1868                     // compatibility variant of a T.
1869                     // We need to either fully compose that combination here
1870                     // (which would complicate the code and may not work with strange custom data)
1871                     // or use the slow path.
1872                 }
1873             } else if (Hangul::isHangulLV(prev)) {
1874                 // The current character is a Jamo Trailing consonant,
1875                 // compose with previous Hangul LV that does not contain a Jamo T.
1876                 if (sink == nullptr) {
1877                     return FALSE;
1878                 }
1879                 UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);
1880                 prevSrc -= 3;  // Replace the Hangul LV as well.
1881                 if (prevBoundary != prevSrc &&
1882                         !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1883                                                        *sink, options, edits, errorCode)) {
1884                     break;
1885                 }
1886                 ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
1887                 prevBoundary = src;
1888                 continue;
1889             }
1890             // No matching context, or may need to decompose surrounding text first:
1891             // Fall through to the slow path.
1892         } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
1893             // One or more combining marks that do not combine-back:
1894             // Check for canonical order, copy unchanged if ok and
1895             // if followed by a character with a boundary-before.
1896             uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
1897             if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
1898                 // Fails FCD test, need to decompose and contiguously recompose.
1899                 if (sink == nullptr) {
1900                     return FALSE;
1901                 }
1902             } else {
1903                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1904                 // the previous character which passed the quick check "yes && ccc==0" test.
1905                 const uint8_t *nextSrc;
1906                 uint16_t n16;
1907                 for (;;) {
1908                     if (src == limit) {
1909                         if (sink != nullptr) {
1910                             ByteSinkUtil::appendUnchanged(prevBoundary, limit,
1911                                                           *sink, options, edits, errorCode);
1912                         }
1913                         return TRUE;
1914                     }
1915                     uint8_t prevCC = cc;
1916                     nextSrc = src;
1917                     UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, n16);
1918                     if (n16 >= MIN_YES_YES_WITH_CC) {
1919                         cc = getCCFromNormalYesOrMaybe(n16);
1920                         if (prevCC > cc) {
1921                             if (sink == nullptr) {
1922                                 return FALSE;
1923                             }
1924                             break;
1925                         }
1926                     } else {
1927                         break;
1928                     }
1929                     src = nextSrc;
1930                 }
1931                 // src is after the last in-order combining mark.
1932                 // If there is a boundary here, then we continue with no change.
1933                 if (norm16HasCompBoundaryBefore(n16)) {
1934                     if (isCompYesAndZeroCC(n16)) {
1935                         src = nextSrc;
1936                     }
1937                     continue;
1938                 }
1939                 // Use the slow path. There is no boundary in [prevSrc, src[.
1940             }
1941         }
1942 
1943         // Slow path: Find the nearest boundaries around the current character,
1944         // decompose and recompose.
1945         if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
1946             const uint8_t *p = prevSrc;
1947             UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, prevBoundary, p, norm16);
1948             if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1949                 prevSrc = p;
1950             }
1951         }
1952         ReorderingBuffer buffer(*this, s16, errorCode);
1953         if (U_FAILURE(errorCode)) {
1954             break;
1955         }
1956         // We know there is not a boundary here.
1957         decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
1958                        buffer, errorCode);
1959         // Decompose until the next boundary.
1960         src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
1961                              buffer, errorCode);
1962         if (U_FAILURE(errorCode)) {
1963             break;
1964         }
1965         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
1966             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1967             return TRUE;
1968         }
1969         recompose(buffer, 0, onlyContiguous);
1970         if (!buffer.equals(prevSrc, src)) {
1971             if (sink == nullptr) {
1972                 return FALSE;
1973             }
1974             if (prevBoundary != prevSrc &&
1975                     !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1976                                                    *sink, options, edits, errorCode)) {
1977                 break;
1978             }
1979             if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
1980                                             *sink, edits, errorCode)) {
1981                 break;
1982             }
1983             prevBoundary = src;
1984         }
1985     }
1986     return TRUE;
1987 }
1988 
hasCompBoundaryBefore(const UChar * src,const UChar * limit) const1989 UBool Normalizer2Impl::hasCompBoundaryBefore(const UChar *src, const UChar *limit) const {
1990     if (src == limit || *src < minCompNoMaybeCP) {
1991         return TRUE;
1992     }
1993     UChar32 c;
1994     uint16_t norm16;
1995     UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
1996     return norm16HasCompBoundaryBefore(norm16);
1997 }
1998 
hasCompBoundaryBefore(const uint8_t * src,const uint8_t * limit) const1999 UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const {
2000     if (src == limit) {
2001         return TRUE;
2002     }
2003     uint16_t norm16;
2004     UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
2005     return norm16HasCompBoundaryBefore(norm16);
2006 }
2007 
hasCompBoundaryAfter(const UChar * start,const UChar * p,UBool onlyContiguous) const2008 UBool Normalizer2Impl::hasCompBoundaryAfter(const UChar *start, const UChar *p,
2009                                             UBool onlyContiguous) const {
2010     if (start == p) {
2011         return TRUE;
2012     }
2013     UChar32 c;
2014     uint16_t norm16;
2015     UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
2016     return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2017 }
2018 
hasCompBoundaryAfter(const uint8_t * start,const uint8_t * p,UBool onlyContiguous) const2019 UBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
2020                                             UBool onlyContiguous) const {
2021     if (start == p) {
2022         return TRUE;
2023     }
2024     uint16_t norm16;
2025     UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, start, p, norm16);
2026     return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2027 }
2028 
// Walks backward from p, one code point at a time, and returns
// the nearest composition boundary at or before p:
// - the limit of a code point that has a boundary after it, or
// - the start of a code point that has a boundary before it.
// Returns start if no boundary is found before reaching it.
const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p,
                                                       UBool onlyContiguous) const {
    while (p != start) {
        const UChar *codePointLimit = p;
        UChar32 c;
        uint16_t norm16;
        // Moves p back to the start of the code point that ends at codePointLimit.
        UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
        if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
            return codePointLimit;
        }
        if (hasCompBoundaryBefore(c, norm16)) {
            return p;
        }
    }
    return p;
}
2045 
// Walks forward from p, one code point at a time, and returns
// the nearest composition boundary at or after p:
// - the start of a code point that has a boundary before it, or
// - the limit of a code point that has a boundary after it.
// Returns limit if no boundary is found before reaching it.
const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit,
                                                   UBool onlyContiguous) const {
    while (p != limit) {
        const UChar *codePointStart = p;
        UChar32 c;
        uint16_t norm16;
        // Moves p forward to the limit of the code point that starts at codePointStart.
        UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
        if (hasCompBoundaryBefore(c, norm16)) {
            return codePointStart;
        }
        if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
            return p;
        }
    }
    return p;
}
2062 
getPreviousTrailCC(const UChar * start,const UChar * p) const2063 uint8_t Normalizer2Impl::getPreviousTrailCC(const UChar *start, const UChar *p) const {
2064     if (start == p) {
2065         return 0;
2066     }
2067     int32_t i = (int32_t)(p - start);
2068     UChar32 c;
2069     U16_PREV(start, 0, i, c);
2070     return (uint8_t)getFCD16(c);
2071 }
2072 
getPreviousTrailCC(const uint8_t * start,const uint8_t * p) const2073 uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const {
2074     if (start == p) {
2075         return 0;
2076     }
2077     int32_t i = (int32_t)(p - start);
2078     UChar32 c;
2079     U8_PREV(start, 0, i, c);
2080     return (uint8_t)getFCD16(c);
2081 }
2082 
// Note: normalizer2impl.cpp r30982 (2011-nov-27)
// still had getFCDTrie() which built and cached an FCD trie.
// That provided faster access to FCD data than getFCD16FromNormData()
// but required synchronization and consumed some 10kB of heap memory
// in any process that uses FCD (e.g., via collation).
// minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
// at least for ASCII & CJK.
2090 
2091 // Ticket 20907 - The optimizer in MSVC/Visual Studio versions below 16.4 has trouble with this
2092 // function on Windows ARM64. As a work-around, we disable optimizations for this function.
2093 // This work-around could/should be removed once the following versions of Visual Studio are no
2094 // longer supported: All versions of VS2017, and versions of VS2019 below 16.4.
2095 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2096 #pragma optimize( "", off )
2097 #endif
2098 // Gets the FCD value from the regular normalization data.
getFCD16FromNormData(UChar32 c) const2099 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
2100     uint16_t norm16=getNorm16(c);
2101     if (norm16 >= limitNoNo) {
2102         if(norm16>=MIN_NORMAL_MAYBE_YES) {
2103             // combining mark
2104             norm16=getCCFromNormalYesOrMaybe(norm16);
2105             return norm16|(norm16<<8);
2106         } else if(norm16>=minMaybeYes) {
2107             return 0;
2108         } else {  // isDecompNoAlgorithmic(norm16)
2109             uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK;
2110             if (deltaTrailCC <= DELTA_TCCC_1) {
2111                 return deltaTrailCC >> OFFSET_SHIFT;
2112             }
2113             // Maps to an isCompYesAndZeroCC.
2114             c=mapAlgorithmic(c, norm16);
2115             norm16=getRawNorm16(c);
2116         }
2117     }
2118     if(norm16<=minYesNo || isHangulLVT(norm16)) {
2119         // no decomposition or Hangul syllable, all zeros
2120         return 0;
2121     }
2122     // c decomposes, get everything from the variable-length extra data
2123     const uint16_t *mapping=getMapping(norm16);
2124     uint16_t firstUnit=*mapping;
2125     norm16=firstUnit>>8;  // tccc
2126     if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
2127         norm16|=*(mapping-1)&0xff00;  // lccc
2128     }
2129     return norm16;
2130 }
2131 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2132 #pragma optimize( "", on )
2133 #endif
2134 
// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
//
// Walks the text at [src, limit[ and verifies that each character's lead
// combining class (lccc, upper byte of its FCD value) is not lower than the
// trail combining class (tccc, lower byte) of the character before it.
//
// src       Start of the input text.
// limit     End of the input text; if NULL, the text is NUL-terminated and
//           the limit is determined with u_strchr(src, 0).
// buffer    If not NULL, receives the output: text in proper order is appended
//           unchanged, out-of-order pieces are passed through decomposeShort().
//           If NULL, nothing is written and the function only checks.
// errorCode In/out error code; a failure reported by the buffer or by
//           decomposeShort() ends the loop.
//
// Returns prevBoundary (the last FCD-safe boundary) as soon as an ordering
// violation is found in check-only mode. Otherwise returns the position where
// processing stopped: limit when the whole input was handled, or the current
// src position if a buffer operation failed.
const UChar *
Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
                         ReorderingBuffer *buffer,
                         UErrorCode &errorCode) const {
    // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
    // Similar to the prevBoundary in the compose() implementation.
    const UChar *prevBoundary=src;
    // FCD value of the previous character.
    // A negative value is ~c for a code unit c below minLcccCP whose FCD lookup
    // has been deferred (see the inner loop below).
    int32_t prevFCD16=0;
    if(limit==NULL) {
        // NUL-terminated input: handle the leading code units below minLcccCP first.
        src=copyLowPrefixFromNulTerminated(src, minLcccCP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        if(prevBoundary<src) {
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            // Fetching the fcd16 value was deferred for this below-U+0300 code point.
            prevFCD16=getFCD16(*(src-1));
            if(prevFCD16>1) {
                // tccc>1: the boundary is before that previous code unit.
                --prevBoundary;
            }
        }
        limit=u_strchr(src, 0);
    }

    // Note: In this function we use buffer->appendZeroCC() because we track
    // the lead and trail combining classes here, rather than leaving it to
    // the ReorderingBuffer.
    // The exception is the call to decomposeShort() which uses the buffer
    // in the normal way.

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t fcd16=0;

    for(;;) {
        // count code units with lccc==0
        for(prevSrc=src; src!=limit;) {
            if((c=*src)<minLcccCP) {
                // Defer the FCD lookup: remember the code unit as a negative value.
                prevFCD16=~c;
                ++src;
            } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                prevFCD16=0;
                ++src;
            } else {
                if(U16_IS_LEAD(c)) {
                    // Combine a surrogate pair into one supplementary code point;
                    // an unpaired lead surrogate is looked up as is.
                    UChar c2;
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                }
                // fcd16<=0xff means that the upper byte (lccc) is zero.
                if((fcd16=getFCD16FromNormData(c))<=0xff) {
                    prevFCD16=fcd16;
                    src+=U16_LENGTH(c);
                } else {
                    // c has lccc!=0: leave the fast loop with src still at the start of c.
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
                break;
            }
            if(src==limit) {
                break;
            }
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            if(prevFCD16<0) {
                // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
                UChar32 prev=~prevFCD16;
                if(prev<minDecompNoCP) {
                    prevFCD16=0;
                } else {
                    prevFCD16=getFCD16FromNormData(prev);
                    if(prevFCD16>1) {
                        --prevBoundary;
                    }
                }
            } else {
                const UChar *p=src-1;
                if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>1) {
                    // tccc>1: the boundary is before the previous character.
                    prevBoundary=p;
                }
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        // Step over the current character, whose FCD value is in fcd16.
        src+=U16_LENGTH(c);
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
        // Check for proper order, and decompose locally if necessary.
        if((prevFCD16&0xff)<=(fcd16>>8)) {
            // proper order: prev tccc <= current lccc
            if((fcd16&0xff)<=1) {
                prevBoundary=src;
            }
            if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
                break;
            }
            prevFCD16=fcd16;
            continue;
        } else if(buffer==NULL) {
            return prevBoundary;  // quick check "no"
        } else {
            /*
             * Back out the part of the source that we copied or appended
             * already but is now going to be decomposed.
             * prevSrc is set to after what was copied/appended.
             */
            buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
            /*
             * Find the part of the source that needs to be decomposed,
             * up to the next safe boundary.
             */
            src=findNextFCDBoundary(src, limit);
            /*
             * The source text does not fulfill the conditions for FCD.
             * Decompose and reorder a limited piece of the text.
             */
            decomposeShort(prevBoundary, src, FALSE, FALSE, *buffer, errorCode);
            if (U_FAILURE(errorCode)) {
                break;
            }
            // Continue after the decomposed piece, starting from a boundary.
            prevBoundary=src;
            prevFCD16=0;
        }
    }
    return src;
}
2278 
makeFCDAndAppend(const UChar * src,const UChar * limit,UBool doMakeFCD,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const2279 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
2280                                        UBool doMakeFCD,
2281                                        UnicodeString &safeMiddle,
2282                                        ReorderingBuffer &buffer,
2283                                        UErrorCode &errorCode) const {
2284     if(!buffer.isEmpty()) {
2285         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
2286         if(src!=firstBoundaryInSrc) {
2287             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
2288                                                                     buffer.getLimit());
2289             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
2290             UnicodeString middle(lastBoundaryInDest, destSuffixLength);
2291             buffer.removeSuffix(destSuffixLength);
2292             safeMiddle=middle;
2293             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
2294             const UChar *middleStart=middle.getBuffer();
2295             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
2296             if(U_FAILURE(errorCode)) {
2297                 return;
2298             }
2299             src=firstBoundaryInSrc;
2300         }
2301     }
2302     if(doMakeFCD) {
2303         makeFCD(src, limit, &buffer, errorCode);
2304     } else {
2305         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
2306             limit=u_strchr(src, 0);
2307         }
2308         buffer.appendZeroCC(src, limit, errorCode);
2309     }
2310 }
2311 
// Iterates backward from p toward start, one code point at a time, and
// returns the first FCD boundary found; returns the position reached
// (start) if there is none.
const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
    while(p>start) {
        const UChar *cpLimit=p;
        UChar32 cp;
        uint16_t norm16;
        // Moves p back to the start of the code point and fetches its norm16.
        UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, cp, norm16);
        if(cp<minDecompNoCP || norm16HasDecompBoundaryAfter(norm16)) {
            // Boundary after this code point.
            return cpLimit;
        }
        if(norm16HasDecompBoundaryBefore(norm16)) {
            // Boundary before this code point; p already points there.
            break;
        }
    }
    return p;
}
2327 
// Iterates forward from p toward limit, one code point at a time, and
// returns the first FCD boundary found; returns the position reached
// (limit) if there is none.
const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
    while(limit>p) {
        const UChar *cpStart=p;
        UChar32 cp;
        uint16_t norm16;
        // Moves p forward past the code point and fetches its norm16.
        UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, cp, norm16);
        if(cp<minLcccCP || norm16HasDecompBoundaryBefore(norm16)) {
            // Boundary before this code point.
            return cpStart;
        }
        if(norm16HasDecompBoundaryAfter(norm16)) {
            // Boundary after this code point; p already points there.
            break;
        }
    }
    return p;
}
2343 
2344 // CanonicalIterator data -------------------------------------------------- ***
2345 
CanonIterData(UErrorCode & errorCode)2346 CanonIterData::CanonIterData(UErrorCode &errorCode) :
2347         mutableTrie(umutablecptrie_open(0, 0, &errorCode)), trie(nullptr),
2348         canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
2349 
~CanonIterData()2350 CanonIterData::~CanonIterData() {
2351     umutablecptrie_close(mutableTrie);
2352     ucptrie_close(trie);
2353 }
2354 
addToStartSet(UChar32 origin,UChar32 decompLead,UErrorCode & errorCode)2355 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
2356     uint32_t canonValue = umutablecptrie_get(mutableTrie, decompLead);
2357     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
2358         // origin is the first character whose decomposition starts with
2359         // the character for which we are setting the value.
2360         umutablecptrie_set(mutableTrie, decompLead, canonValue|origin, &errorCode);
2361     } else {
2362         // origin is not the first character, or it is U+0000.
2363         UnicodeSet *set;
2364         if((canonValue&CANON_HAS_SET)==0) {
2365             set=new UnicodeSet;
2366             if(set==NULL) {
2367                 errorCode=U_MEMORY_ALLOCATION_ERROR;
2368                 return;
2369             }
2370             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
2371             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
2372             umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode);
2373             canonStartSets.addElement(set, errorCode);
2374             if(firstOrigin!=0) {
2375                 set->add(firstOrigin);
2376             }
2377         } else {
2378             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
2379         }
2380         set->add(origin);
2381     }
2382 }
2383 
2384 // C++ class for friend access to private Normalizer2Impl members.
2385 class InitCanonIterData {
2386 public:
2387     static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode);
2388 };
2389 
2390 U_CDECL_BEGIN
2391 
2392 // UInitOnce instantiation function for CanonIterData
2393 static void U_CALLCONV
initCanonIterData(Normalizer2Impl * impl,UErrorCode & errorCode)2394 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
2395     InitCanonIterData::doInit(impl, errorCode);
2396 }
2397 
2398 U_CDECL_END
2399 
doInit(Normalizer2Impl * impl,UErrorCode & errorCode)2400 void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
2401     U_ASSERT(impl->fCanonIterData == NULL);
2402     impl->fCanonIterData = new CanonIterData(errorCode);
2403     if (impl->fCanonIterData == NULL) {
2404         errorCode=U_MEMORY_ALLOCATION_ERROR;
2405     }
2406     if (U_SUCCESS(errorCode)) {
2407         UChar32 start = 0, end;
2408         uint32_t value;
2409         while ((end = ucptrie_getRange(impl->normTrie, start,
2410                                        UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,
2411                                        nullptr, nullptr, &value)) >= 0) {
2412             // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
2413             if (value != Normalizer2Impl::INERT) {
2414                 impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode);
2415             }
2416             start = end + 1;
2417         }
2418 #ifdef UCPTRIE_DEBUG
2419         umutablecptrie_setName(impl->fCanonIterData->mutableTrie, "CanonIterData");
2420 #endif
2421         impl->fCanonIterData->trie = umutablecptrie_buildImmutable(
2422             impl->fCanonIterData->mutableTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_32, &errorCode);
2423         umutablecptrie_close(impl->fCanonIterData->mutableTrie);
2424         impl->fCanonIterData->mutableTrie = nullptr;
2425     }
2426     if (U_FAILURE(errorCode)) {
2427         delete impl->fCanonIterData;
2428         impl->fCanonIterData = NULL;
2429     }
2430 }
2431 
makeCanonIterDataFromNorm16(UChar32 start,UChar32 end,const uint16_t norm16,CanonIterData & newData,UErrorCode & errorCode) const2432 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
2433                                                   CanonIterData &newData,
2434                                                   UErrorCode &errorCode) const {
2435     if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) {
2436         // Inert, or 2-way mapping (including Hangul syllable).
2437         // We do not write a canonStartSet for any yesNo character.
2438         // Composites from 2-way mappings are added at runtime from the
2439         // starter's compositions list, and the other characters in
2440         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
2441         // "maybe" characters.
2442         return;
2443     }
2444     for(UChar32 c=start; c<=end; ++c) {
2445         uint32_t oldValue = umutablecptrie_get(newData.mutableTrie, c);
2446         uint32_t newValue=oldValue;
2447         if(isMaybeOrNonZeroCC(norm16)) {
2448             // not a segment starter if it occurs in a decomposition or has cc!=0
2449             newValue|=CANON_NOT_SEGMENT_STARTER;
2450             if(norm16<MIN_NORMAL_MAYBE_YES) {
2451                 newValue|=CANON_HAS_COMPOSITIONS;
2452             }
2453         } else if(norm16<minYesNo) {
2454             newValue|=CANON_HAS_COMPOSITIONS;
2455         } else {
2456             // c has a one-way decomposition
2457             UChar32 c2=c;
2458             // Do not modify the whole-range norm16 value.
2459             uint16_t norm16_2=norm16;
2460             if (isDecompNoAlgorithmic(norm16_2)) {
2461                 // Maps to an isCompYesAndZeroCC.
2462                 c2 = mapAlgorithmic(c2, norm16_2);
2463                 norm16_2 = getRawNorm16(c2);
2464                 // No compatibility mappings for the CanonicalIterator.
2465                 U_ASSERT(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));
2466             }
2467             if (norm16_2 > minYesNo) {
2468                 // c decomposes, get everything from the variable-length extra data
2469                 const uint16_t *mapping=getMapping(norm16_2);
2470                 uint16_t firstUnit=*mapping;
2471                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
2472                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
2473                     if(c==c2 && (*(mapping-1)&0xff)!=0) {
2474                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
2475                     }
2476                 }
2477                 // Skip empty mappings (no characters in the decomposition).
2478                 if(length!=0) {
2479                     ++mapping;  // skip over the firstUnit
2480                     // add c to first code point's start set
2481                     int32_t i=0;
2482                     U16_NEXT_UNSAFE(mapping, i, c2);
2483                     newData.addToStartSet(c, c2, errorCode);
2484                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
2485                     // one-way mapping. A 2-way mapping is possible here after
2486                     // intermediate algorithmic mapping.
2487                     if(norm16_2>=minNoNo) {
2488                         while(i<length) {
2489                             U16_NEXT_UNSAFE(mapping, i, c2);
2490                             uint32_t c2Value = umutablecptrie_get(newData.mutableTrie, c2);
2491                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
2492                                 umutablecptrie_set(newData.mutableTrie, c2,
2493                                                    c2Value|CANON_NOT_SEGMENT_STARTER, &errorCode);
2494                             }
2495                         }
2496                     }
2497                 }
2498             } else {
2499                 // c decomposed to c2 algorithmically; c has cc==0
2500                 newData.addToStartSet(c, c2, errorCode);
2501             }
2502         }
2503         if(newValue!=oldValue) {
2504             umutablecptrie_set(newData.mutableTrie, c, newValue, &errorCode);
2505         }
2506     }
2507 }
2508 
ensureCanonIterData(UErrorCode & errorCode) const2509 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
2510     // Logically const: Synchronized instantiation.
2511     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
2512     umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
2513     return U_SUCCESS(errorCode);
2514 }
2515 
getCanonValue(UChar32 c) const2516 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
2517     return (int32_t)ucptrie_get(fCanonIterData->trie, c);
2518 }
2519 
getCanonStartSet(int32_t n) const2520 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
2521     return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
2522 }
2523 
isCanonSegmentStarter(UChar32 c) const2524 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
2525     return getCanonValue(c)>=0;
2526 }
2527 
getCanonStartSet(UChar32 c,UnicodeSet & set) const2528 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
2529     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
2530     if(canonValue==0) {
2531         return FALSE;
2532     }
2533     set.clear();
2534     int32_t value=canonValue&CANON_VALUE_MASK;
2535     if((canonValue&CANON_HAS_SET)!=0) {
2536         set.addAll(getCanonStartSet(value));
2537     } else if(value!=0) {
2538         set.add(value);
2539     }
2540     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
2541         uint16_t norm16=getRawNorm16(c);
2542         if(norm16==JAMO_L) {
2543             UChar32 syllable=
2544                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
2545             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
2546         } else {
2547             addComposites(getCompositionsList(norm16), set);
2548         }
2549     }
2550     return TRUE;
2551 }
2552 
2553 U_NAMESPACE_END
2554 
2555 // Normalizer2 data swapping ----------------------------------------------- ***
2556 
2557 U_NAMESPACE_USE
2558 
2559 U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)2560 unorm2_swap(const UDataSwapper *ds,
2561             const void *inData, int32_t length, void *outData,
2562             UErrorCode *pErrorCode) {
2563     const UDataInfo *pInfo;
2564     int32_t headerSize;
2565 
2566     const uint8_t *inBytes;
2567     uint8_t *outBytes;
2568 
2569     const int32_t *inIndexes;
2570     int32_t indexes[Normalizer2Impl::IX_TOTAL_SIZE+1];
2571 
2572     int32_t i, offset, nextOffset, size;
2573 
2574     /* udata_swapDataHeader checks the arguments */
2575     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
2576     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
2577         return 0;
2578     }
2579 
2580     /* check data format and format version */
2581     pInfo=(const UDataInfo *)((const char *)inData+4);
2582     uint8_t formatVersion0=pInfo->formatVersion[0];
2583     if(!(
2584         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
2585         pInfo->dataFormat[1]==0x72 &&
2586         pInfo->dataFormat[2]==0x6d &&
2587         pInfo->dataFormat[3]==0x32 &&
2588         (1<=formatVersion0 && formatVersion0<=4)
2589     )) {
2590         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2591                          pInfo->dataFormat[0], pInfo->dataFormat[1],
2592                          pInfo->dataFormat[2], pInfo->dataFormat[3],
2593                          pInfo->formatVersion[0]);
2594         *pErrorCode=U_UNSUPPORTED_ERROR;
2595         return 0;
2596     }
2597 
2598     inBytes=(const uint8_t *)inData+headerSize;
2599     outBytes=(uint8_t *)outData+headerSize;
2600 
2601     inIndexes=(const int32_t *)inBytes;
2602     int32_t minIndexesLength;
2603     if(formatVersion0==1) {
2604         minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1;
2605     } else if(formatVersion0==2) {
2606         minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1;
2607     } else {
2608         minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1;
2609     }
2610 
2611     if(length>=0) {
2612         length-=headerSize;
2613         if(length<minIndexesLength*4) {
2614             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2615                              length);
2616             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2617             return 0;
2618         }
2619     }
2620 
2621     /* read the first few indexes */
2622     for(i=0; i<UPRV_LENGTHOF(indexes); ++i) {
2623         indexes[i]=udata_readInt32(ds, inIndexes[i]);
2624     }
2625 
2626     /* get the total length of the data */
2627     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2628 
2629     if(length>=0) {
2630         if(length<size) {
2631             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2632                              length);
2633             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2634             return 0;
2635         }
2636 
2637         /* copy the data for inaccessible bytes */
2638         if(inBytes!=outBytes) {
2639             uprv_memcpy(outBytes, inBytes, size);
2640         }
2641 
2642         offset=0;
2643 
2644         /* swap the int32_t indexes[] */
2645         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2646         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2647         offset=nextOffset;
2648 
2649         /* swap the trie */
2650         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2651         utrie_swapAnyVersion(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2652         offset=nextOffset;
2653 
2654         /* swap the uint16_t extraData[] */
2655         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2656         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2657         offset=nextOffset;
2658 
2659         /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2660         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2661         offset=nextOffset;
2662 
2663         U_ASSERT(offset==size);
2664     }
2665 
2666     return headerSize+size;
2667 }
2668 
2669 #endif  // !UCONFIG_NO_NORMALIZATION
2670