1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2009-2014, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: normalizer2impl.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009nov22
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_NORMALIZATION
20
21 #include "unicode/normalizer2.h"
22 #include "unicode/udata.h"
23 #include "unicode/ustring.h"
24 #include "unicode/utf16.h"
25 #include "cmemory.h"
26 #include "mutex.h"
27 #include "normalizer2impl.h"
28 #include "putilimp.h"
29 #include "uassert.h"
30 #include "uset_imp.h"
31 #include "utrie2.h"
32 #include "uvector.h"
33
34 U_NAMESPACE_BEGIN
35
36 // ReorderingBuffer -------------------------------------------------------- ***
37
init(int32_t destCapacity,UErrorCode & errorCode)38 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
39 int32_t length=str.length();
40 start=str.getBuffer(destCapacity);
41 if(start==NULL) {
42 // getBuffer() already did str.setToBogus()
43 errorCode=U_MEMORY_ALLOCATION_ERROR;
44 return FALSE;
45 }
46 limit=start+length;
47 remainingCapacity=str.getCapacity()-length;
48 reorderStart=start;
49 if(start==limit) {
50 lastCC=0;
51 } else {
52 setIterator();
53 lastCC=previousCC();
54 // Set reorderStart after the last code point with cc<=1 if there is one.
55 if(lastCC>1) {
56 while(previousCC()>1) {}
57 }
58 reorderStart=codePointLimit;
59 }
60 return TRUE;
61 }
62
equals(const UChar * otherStart,const UChar * otherLimit) const63 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
64 int32_t length=(int32_t)(limit-start);
65 return
66 length==(int32_t)(otherLimit-otherStart) &&
67 0==u_memcmp(start, otherStart, length);
68 }
69
appendSupplementary(UChar32 c,uint8_t cc,UErrorCode & errorCode)70 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
71 if(remainingCapacity<2 && !resize(2, errorCode)) {
72 return FALSE;
73 }
74 if(lastCC<=cc || cc==0) {
75 limit[0]=U16_LEAD(c);
76 limit[1]=U16_TRAIL(c);
77 limit+=2;
78 lastCC=cc;
79 if(cc<=1) {
80 reorderStart=limit;
81 }
82 } else {
83 insert(c, cc);
84 }
85 remainingCapacity-=2;
86 return TRUE;
87 }
88
append(const UChar * s,int32_t length,uint8_t leadCC,uint8_t trailCC,UErrorCode & errorCode)89 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
90 uint8_t leadCC, uint8_t trailCC,
91 UErrorCode &errorCode) {
92 if(length==0) {
93 return TRUE;
94 }
95 if(remainingCapacity<length && !resize(length, errorCode)) {
96 return FALSE;
97 }
98 remainingCapacity-=length;
99 if(lastCC<=leadCC || leadCC==0) {
100 if(trailCC<=1) {
101 reorderStart=limit+length;
102 } else if(leadCC<=1) {
103 reorderStart=limit+1; // Ok if not a code point boundary.
104 }
105 const UChar *sLimit=s+length;
106 do { *limit++=*s++; } while(s!=sLimit);
107 lastCC=trailCC;
108 } else {
109 int32_t i=0;
110 UChar32 c;
111 U16_NEXT(s, i, length, c);
112 insert(c, leadCC); // insert first code point
113 while(i<length) {
114 U16_NEXT(s, i, length, c);
115 if(i<length) {
116 // s must be in NFD, otherwise we need to use getCC().
117 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
118 } else {
119 leadCC=trailCC;
120 }
121 append(c, leadCC, errorCode);
122 }
123 }
124 return TRUE;
125 }
126
appendZeroCC(UChar32 c,UErrorCode & errorCode)127 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
128 int32_t cpLength=U16_LENGTH(c);
129 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
130 return FALSE;
131 }
132 remainingCapacity-=cpLength;
133 if(cpLength==1) {
134 *limit++=(UChar)c;
135 } else {
136 limit[0]=U16_LEAD(c);
137 limit[1]=U16_TRAIL(c);
138 limit+=2;
139 }
140 lastCC=0;
141 reorderStart=limit;
142 return TRUE;
143 }
144
appendZeroCC(const UChar * s,const UChar * sLimit,UErrorCode & errorCode)145 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
146 if(s==sLimit) {
147 return TRUE;
148 }
149 int32_t length=(int32_t)(sLimit-s);
150 if(remainingCapacity<length && !resize(length, errorCode)) {
151 return FALSE;
152 }
153 u_memcpy(limit, s, length);
154 limit+=length;
155 remainingCapacity-=length;
156 lastCC=0;
157 reorderStart=limit;
158 return TRUE;
159 }
160
remove()161 void ReorderingBuffer::remove() {
162 reorderStart=limit=start;
163 remainingCapacity=str.getCapacity();
164 lastCC=0;
165 }
166
removeSuffix(int32_t suffixLength)167 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
168 if(suffixLength<(limit-start)) {
169 limit-=suffixLength;
170 remainingCapacity+=suffixLength;
171 } else {
172 limit=start;
173 remainingCapacity=str.getCapacity();
174 }
175 lastCC=0;
176 reorderStart=limit;
177 }
178
resize(int32_t appendLength,UErrorCode & errorCode)179 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
180 int32_t reorderStartIndex=(int32_t)(reorderStart-start);
181 int32_t length=(int32_t)(limit-start);
182 str.releaseBuffer(length);
183 int32_t newCapacity=length+appendLength;
184 int32_t doubleCapacity=2*str.getCapacity();
185 if(newCapacity<doubleCapacity) {
186 newCapacity=doubleCapacity;
187 }
188 if(newCapacity<256) {
189 newCapacity=256;
190 }
191 start=str.getBuffer(newCapacity);
192 if(start==NULL) {
193 // getBuffer() already did str.setToBogus()
194 errorCode=U_MEMORY_ALLOCATION_ERROR;
195 return FALSE;
196 }
197 reorderStart=start+reorderStartIndex;
198 limit=start+length;
199 remainingCapacity=str.getCapacity()-length;
200 return TRUE;
201 }
202
skipPrevious()203 void ReorderingBuffer::skipPrevious() {
204 codePointLimit=codePointStart;
205 UChar c=*--codePointStart;
206 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
207 --codePointStart;
208 }
209 }
210
previousCC()211 uint8_t ReorderingBuffer::previousCC() {
212 codePointLimit=codePointStart;
213 if(reorderStart>=codePointStart) {
214 return 0;
215 }
216 UChar32 c=*--codePointStart;
217 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
218 return 0;
219 }
220
221 UChar c2;
222 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
223 --codePointStart;
224 c=U16_GET_SUPPLEMENTARY(c2, c);
225 }
226 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
227 }
228
229 // Inserts c somewhere before the last character.
230 // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(UChar32 c,uint8_t cc)231 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
232 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
233 // insert c at codePointLimit, after the character with prevCC<=cc
234 UChar *q=limit;
235 UChar *r=limit+=U16_LENGTH(c);
236 do {
237 *--r=*--q;
238 } while(codePointLimit!=q);
239 writeCodePoint(q, c);
240 if(cc<=1) {
241 reorderStart=r;
242 }
243 }
244
245 // Normalizer2Impl --------------------------------------------------------- ***
246
247 struct CanonIterData : public UMemory {
248 CanonIterData(UErrorCode &errorCode);
249 ~CanonIterData();
250 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
251 UTrie2 *trie;
252 UVector canonStartSets; // contains UnicodeSet *
253 };
254
~Normalizer2Impl()255 Normalizer2Impl::~Normalizer2Impl() {
256 delete fCanonIterData;
257 }
258
259 void
init(const int32_t * inIndexes,const UTrie2 * inTrie,const uint16_t * inExtraData,const uint8_t * inSmallFCD)260 Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie,
261 const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
262 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
263 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
264
265 minYesNo=inIndexes[IX_MIN_YES_NO];
266 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
267 minNoNo=inIndexes[IX_MIN_NO_NO];
268 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
269 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
270
271 normTrie=inTrie;
272
273 maybeYesCompositions=inExtraData;
274 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
275
276 smallFCD=inSmallFCD;
277
278 // Build tccc180[].
279 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
280 uint8_t bits=0;
281 for(UChar c=0; c<0x180; bits>>=1) {
282 if((c&0xff)==0) {
283 bits=smallFCD[c>>8]; // one byte per 0x100 code points
284 }
285 if(bits&1) {
286 for(int i=0; i<0x20; ++i, ++c) {
287 tccc180[c]=(uint8_t)getFCD16FromNormData(c);
288 }
289 } else {
290 uprv_memset(tccc180+c, 0, 0x20);
291 c+=0x20;
292 }
293 }
294 }
295
getTrailCCFromCompYesAndZeroCC(const UChar * cpStart,const UChar * cpLimit) const296 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
297 UChar32 c;
298 if(cpStart==(cpLimit-1)) {
299 c=*cpStart;
300 } else {
301 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
302 }
303 uint16_t prevNorm16=getNorm16(c);
304 if(prevNorm16<=minYesNo) {
305 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
306 } else {
307 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo
308 }
309 }
310
311 namespace {
312
313 class LcccContext {
314 public:
LcccContext(const Normalizer2Impl & ni,UnicodeSet & s)315 LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {}
316
handleRange(UChar32 start,UChar32 end,uint16_t norm16)317 void handleRange(UChar32 start, UChar32 end, uint16_t norm16) {
318 if(impl.isAlgorithmicNoNo(norm16)) {
319 // Range of code points with same-norm16-value algorithmic decompositions.
320 // They might have different non-zero FCD16 values.
321 do {
322 uint16_t fcd16=impl.getFCD16(start);
323 if(fcd16>0xff) { set.add(start); }
324 } while(++start<=end);
325 } else {
326 uint16_t fcd16=impl.getFCD16(start);
327 if(fcd16>0xff) { set.add(start, end); }
328 }
329 }
330
331 private:
332 const Normalizer2Impl &impl;
333 UnicodeSet &set;
334 };
335
336 struct PropertyStartsContext {
PropertyStartsContext__anon7311a2d80111::PropertyStartsContext337 PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder)
338 : impl(ni), sa(adder) {}
339
340 const Normalizer2Impl &impl;
341 const USetAdder *sa;
342 };
343
344 } // namespace
345
346 U_CDECL_BEGIN
347
348 static UBool U_CALLCONV
enumLcccRange(const void * context,UChar32 start,UChar32 end,uint32_t value)349 enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
350 ((LcccContext *)context)->handleRange(start, end, (uint16_t)value);
351 return TRUE;
352 }
353
354 static UBool U_CALLCONV
enumNorm16PropertyStartsRange(const void * context,UChar32 start,UChar32 end,uint32_t value)355 enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
356 /* add the start code point to the USet */
357 const PropertyStartsContext *ctx=(const PropertyStartsContext *)context;
358 const USetAdder *sa=ctx->sa;
359 sa->add(sa->set, start);
360 if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) {
361 // Range of code points with same-norm16-value algorithmic decompositions.
362 // They might have different non-zero FCD16 values.
363 uint16_t prevFCD16=ctx->impl.getFCD16(start);
364 while(++start<=end) {
365 uint16_t fcd16=ctx->impl.getFCD16(start);
366 if(fcd16!=prevFCD16) {
367 sa->add(sa->set, start);
368 prevFCD16=fcd16;
369 }
370 }
371 }
372 return TRUE;
373 }
374
375 static UBool U_CALLCONV
enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)376 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
377 /* add the start code point to the USet */
378 const USetAdder *sa=(const USetAdder *)context;
379 sa->add(sa->set, start);
380 return TRUE;
381 }
382
383 static uint32_t U_CALLCONV
segmentStarterMapper(const void *,uint32_t value)384 segmentStarterMapper(const void * /*context*/, uint32_t value) {
385 return value&CANON_NOT_SEGMENT_STARTER;
386 }
387
388 U_CDECL_END
389
390 void
addLcccChars(UnicodeSet & set) const391 Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
392 /* add the start code point of each same-value range of each trie */
393 LcccContext context(*this, set);
394 utrie2_enum(normTrie, NULL, enumLcccRange, &context);
395 }
396
397 void
addPropertyStarts(const USetAdder * sa,UErrorCode &) const398 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
399 /* add the start code point of each same-value range of each trie */
400 PropertyStartsContext context(*this, sa);
401 utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context);
402
403 /* add Hangul LV syllables and LV+1 because of skippables */
404 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
405 sa->add(sa->set, c);
406 sa->add(sa->set, c+1);
407 }
408 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
409 }
410
411 void
addCanonIterPropertyStarts(const USetAdder * sa,UErrorCode & errorCode) const412 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
413 /* add the start code point of each same-value range of the canonical iterator data trie */
414 if(ensureCanonIterData(errorCode)) {
415 // currently only used for the SEGMENT_STARTER property
416 utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
417 }
418 }
419
420 const UChar *
copyLowPrefixFromNulTerminated(const UChar * src,UChar32 minNeedDataCP,ReorderingBuffer * buffer,UErrorCode & errorCode) const421 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
422 UChar32 minNeedDataCP,
423 ReorderingBuffer *buffer,
424 UErrorCode &errorCode) const {
425 // Make some effort to support NUL-terminated strings reasonably.
426 // Take the part of the fast quick check loop that does not look up
427 // data and check the first part of the string.
428 // After this prefix, determine the string length to simplify the rest
429 // of the code.
430 const UChar *prevSrc=src;
431 UChar c;
432 while((c=*src++)<minNeedDataCP && c!=0) {}
433 // Back out the last character for full processing.
434 // Copy this prefix.
435 if(--src!=prevSrc) {
436 if(buffer!=NULL) {
437 buffer->appendZeroCC(prevSrc, src, errorCode);
438 }
439 }
440 return src;
441 }
442
443 UnicodeString &
decompose(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const444 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
445 UErrorCode &errorCode) const {
446 if(U_FAILURE(errorCode)) {
447 dest.setToBogus();
448 return dest;
449 }
450 const UChar *sArray=src.getBuffer();
451 if(&dest==&src || sArray==NULL) {
452 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
453 dest.setToBogus();
454 return dest;
455 }
456 decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
457 return dest;
458 }
459
460 void
decompose(const UChar * src,const UChar * limit,UnicodeString & dest,int32_t destLengthEstimate,UErrorCode & errorCode) const461 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
462 UnicodeString &dest,
463 int32_t destLengthEstimate,
464 UErrorCode &errorCode) const {
465 if(destLengthEstimate<0 && limit!=NULL) {
466 destLengthEstimate=(int32_t)(limit-src);
467 }
468 dest.remove();
469 ReorderingBuffer buffer(*this, dest);
470 if(buffer.init(destLengthEstimate, errorCode)) {
471 decompose(src, limit, &buffer, errorCode);
472 }
473 }
474
475 // Dual functionality:
476 // buffer!=NULL: normalize
477 // buffer==NULL: isNormalized/spanQuickCheckYes
478 const UChar *
decompose(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const479 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
480 ReorderingBuffer *buffer,
481 UErrorCode &errorCode) const {
482 UChar32 minNoCP=minDecompNoCP;
483 if(limit==NULL) {
484 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
485 if(U_FAILURE(errorCode)) {
486 return src;
487 }
488 limit=u_strchr(src, 0);
489 }
490
491 const UChar *prevSrc;
492 UChar32 c=0;
493 uint16_t norm16=0;
494
495 // only for quick check
496 const UChar *prevBoundary=src;
497 uint8_t prevCC=0;
498
499 for(;;) {
500 // count code units below the minimum or with irrelevant data for the quick check
501 for(prevSrc=src; src!=limit;) {
502 if( (c=*src)<minNoCP ||
503 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
504 ) {
505 ++src;
506 } else if(!U16_IS_SURROGATE(c)) {
507 break;
508 } else {
509 UChar c2;
510 if(U16_IS_SURROGATE_LEAD(c)) {
511 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
512 c=U16_GET_SUPPLEMENTARY(c, c2);
513 }
514 } else /* trail surrogate */ {
515 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
516 --src;
517 c=U16_GET_SUPPLEMENTARY(c2, c);
518 }
519 }
520 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
521 src+=U16_LENGTH(c);
522 } else {
523 break;
524 }
525 }
526 }
527 // copy these code units all at once
528 if(src!=prevSrc) {
529 if(buffer!=NULL) {
530 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
531 break;
532 }
533 } else {
534 prevCC=0;
535 prevBoundary=src;
536 }
537 }
538 if(src==limit) {
539 break;
540 }
541
542 // Check one above-minimum, relevant code point.
543 src+=U16_LENGTH(c);
544 if(buffer!=NULL) {
545 if(!decompose(c, norm16, *buffer, errorCode)) {
546 break;
547 }
548 } else {
549 if(isDecompYes(norm16)) {
550 uint8_t cc=getCCFromYesOrMaybe(norm16);
551 if(prevCC<=cc || cc==0) {
552 prevCC=cc;
553 if(cc<=1) {
554 prevBoundary=src;
555 }
556 continue;
557 }
558 }
559 return prevBoundary; // "no" or cc out of order
560 }
561 }
562 return src;
563 }
564
565 // Decompose a short piece of text which is likely to contain characters that
566 // fail the quick check loop and/or where the quick check loop's overhead
567 // is unlikely to be amortized.
568 // Called by the compose() and makeFCD() implementations.
decomposeShort(const UChar * src,const UChar * limit,ReorderingBuffer & buffer,UErrorCode & errorCode) const569 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
570 ReorderingBuffer &buffer,
571 UErrorCode &errorCode) const {
572 while(src<limit) {
573 UChar32 c;
574 uint16_t norm16;
575 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
576 if(!decompose(c, norm16, buffer, errorCode)) {
577 return FALSE;
578 }
579 }
580 return TRUE;
581 }
582
decompose(UChar32 c,uint16_t norm16,ReorderingBuffer & buffer,UErrorCode & errorCode) const583 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
584 ReorderingBuffer &buffer,
585 UErrorCode &errorCode) const {
586 // Only loops for 1:1 algorithmic mappings.
587 for(;;) {
588 // get the decomposition and the lead and trail cc's
589 if(isDecompYes(norm16)) {
590 // c does not decompose
591 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
592 } else if(isHangul(norm16)) {
593 // Hangul syllable: decompose algorithmically
594 UChar jamos[3];
595 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
596 } else if(isDecompNoAlgorithmic(norm16)) {
597 c=mapAlgorithmic(c, norm16);
598 norm16=getNorm16(c);
599 } else {
600 // c decomposes, get everything from the variable-length extra data
601 const uint16_t *mapping=getMapping(norm16);
602 uint16_t firstUnit=*mapping;
603 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
604 uint8_t leadCC, trailCC;
605 trailCC=(uint8_t)(firstUnit>>8);
606 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
607 leadCC=(uint8_t)(*(mapping-1)>>8);
608 } else {
609 leadCC=0;
610 }
611 return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
612 }
613 }
614 }
615
616 const UChar *
getDecomposition(UChar32 c,UChar buffer[4],int32_t & length) const617 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
618 const UChar *decomp=NULL;
619 uint16_t norm16;
620 for(;;) {
621 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
622 // c does not decompose
623 return decomp;
624 } else if(isHangul(norm16)) {
625 // Hangul syllable: decompose algorithmically
626 length=Hangul::decompose(c, buffer);
627 return buffer;
628 } else if(isDecompNoAlgorithmic(norm16)) {
629 c=mapAlgorithmic(c, norm16);
630 decomp=buffer;
631 length=0;
632 U16_APPEND_UNSAFE(buffer, length, c);
633 } else {
634 // c decomposes, get everything from the variable-length extra data
635 const uint16_t *mapping=getMapping(norm16);
636 length=*mapping&MAPPING_LENGTH_MASK;
637 return (const UChar *)mapping+1;
638 }
639 }
640 }
641
642 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
643 // so that a raw mapping fits that consists of one unit ("rm0")
644 // plus all but the first two code units of the normal mapping.
645 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
646 const UChar *
getRawDecomposition(UChar32 c,UChar buffer[30],int32_t & length) const647 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
648 // We do not loop in this method because an algorithmic mapping itself
649 // becomes a final result rather than having to be decomposed recursively.
650 uint16_t norm16;
651 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
652 // c does not decompose
653 return NULL;
654 } else if(isHangul(norm16)) {
655 // Hangul syllable: decompose algorithmically
656 Hangul::getRawDecomposition(c, buffer);
657 length=2;
658 return buffer;
659 } else if(isDecompNoAlgorithmic(norm16)) {
660 c=mapAlgorithmic(c, norm16);
661 length=0;
662 U16_APPEND_UNSAFE(buffer, length, c);
663 return buffer;
664 } else {
665 // c decomposes, get everything from the variable-length extra data
666 const uint16_t *mapping=getMapping(norm16);
667 uint16_t firstUnit=*mapping;
668 int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping
669 if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
670 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
671 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
672 const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
673 uint16_t rm0=*rawMapping;
674 if(rm0<=MAPPING_LENGTH_MASK) {
675 length=rm0;
676 return (const UChar *)rawMapping-rm0;
677 } else {
678 // Copy the normal mapping and replace its first two code units with rm0.
679 buffer[0]=(UChar)rm0;
680 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
681 length=mLength-1;
682 return buffer;
683 }
684 } else {
685 length=mLength;
686 return (const UChar *)mapping+1;
687 }
688 }
689 }
690
decomposeAndAppend(const UChar * src,const UChar * limit,UBool doDecompose,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const691 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
692 UBool doDecompose,
693 UnicodeString &safeMiddle,
694 ReorderingBuffer &buffer,
695 UErrorCode &errorCode) const {
696 buffer.copyReorderableSuffixTo(safeMiddle);
697 if(doDecompose) {
698 decompose(src, limit, &buffer, errorCode);
699 return;
700 }
701 // Just merge the strings at the boundary.
702 ForwardUTrie2StringIterator iter(normTrie, src, limit);
703 uint8_t firstCC, prevCC, cc;
704 firstCC=prevCC=cc=getCC(iter.next16());
705 while(cc!=0) {
706 prevCC=cc;
707 cc=getCC(iter.next16());
708 };
709 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
710 limit=u_strchr(iter.codePointStart, 0);
711 }
712
713 if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
714 buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
715 }
716 }
717
718 // Note: hasDecompBoundary() could be implemented as aliases to
719 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
720 // at the cost of building the FCD trie for a decomposition normalizer.
hasDecompBoundary(UChar32 c,UBool before) const721 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
722 for(;;) {
723 if(c<minDecompNoCP) {
724 return TRUE;
725 }
726 uint16_t norm16=getNorm16(c);
727 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
728 return TRUE;
729 } else if(norm16>MIN_NORMAL_MAYBE_YES) {
730 return FALSE; // ccc!=0
731 } else if(isDecompNoAlgorithmic(norm16)) {
732 c=mapAlgorithmic(c, norm16);
733 } else {
734 // c decomposes, get everything from the variable-length extra data
735 const uint16_t *mapping=getMapping(norm16);
736 uint16_t firstUnit=*mapping;
737 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
738 return FALSE;
739 }
740 if(!before) {
741 // decomp after-boundary: same as hasFCDBoundaryAfter(),
742 // fcd16<=1 || trailCC==0
743 if(firstUnit>0x1ff) {
744 return FALSE; // trailCC>1
745 }
746 if(firstUnit<=0xff) {
747 return TRUE; // trailCC==0
748 }
749 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
750 }
751 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
752 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
753 }
754 }
755 }
756
757 /*
758 * Finds the recomposition result for
759 * a forward-combining "lead" character,
760 * specified with a pointer to its compositions list,
761 * and a backward-combining "trail" character.
762 *
763 * If the lead and trail characters combine, then this function returns
764 * the following "compositeAndFwd" value:
765 * Bits 21..1 composite character
766 * Bit 0 set if the composite is a forward-combining starter
767 * otherwise it returns -1.
768 *
769 * The compositions list has (trail, compositeAndFwd) pair entries,
770 * encoded as either pairs or triples of 16-bit units.
771 * The last entry has the high bit of its first unit set.
772 *
773 * The list is sorted by ascending trail characters (there are no duplicates).
774 * A linear search is used.
775 *
776 * See normalizer2impl.h for a more detailed description
777 * of the compositions list format.
778 */
combine(const uint16_t * list,UChar32 trail)779 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
780 uint16_t key1, firstUnit;
781 if(trail<COMP_1_TRAIL_LIMIT) {
782 // trail character is 0..33FF
783 // result entry may have 2 or 3 units
784 key1=(uint16_t)(trail<<1);
785 while(key1>(firstUnit=*list)) {
786 list+=2+(firstUnit&COMP_1_TRIPLE);
787 }
788 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
789 if(firstUnit&COMP_1_TRIPLE) {
790 return ((int32_t)list[1]<<16)|list[2];
791 } else {
792 return list[1];
793 }
794 }
795 } else {
796 // trail character is 3400..10FFFF
797 // result entry has 3 units
798 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
799 (((trail>>COMP_1_TRAIL_SHIFT))&
800 ~COMP_1_TRIPLE));
801 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
802 uint16_t secondUnit;
803 for(;;) {
804 if(key1>(firstUnit=*list)) {
805 list+=2+(firstUnit&COMP_1_TRIPLE);
806 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
807 if(key2>(secondUnit=list[1])) {
808 if(firstUnit&COMP_1_LAST_TUPLE) {
809 break;
810 } else {
811 list+=3;
812 }
813 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
814 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
815 } else {
816 break;
817 }
818 } else {
819 break;
820 }
821 }
822 }
823 return -1;
824 }
825
826 /**
827 * @param list some character's compositions list
828 * @param set recursively receives the composites from these compositions
829 */
addComposites(const uint16_t * list,UnicodeSet & set) const830 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
831 uint16_t firstUnit;
832 int32_t compositeAndFwd;
833 do {
834 firstUnit=*list;
835 if((firstUnit&COMP_1_TRIPLE)==0) {
836 compositeAndFwd=list[1];
837 list+=2;
838 } else {
839 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
840 list+=3;
841 }
842 UChar32 composite=compositeAndFwd>>1;
843 if((compositeAndFwd&1)!=0) {
844 addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
845 }
846 set.add(composite);
847 } while((firstUnit&COMP_1_LAST_TUPLE)==0);
848 }
849
850 /*
851 * Recomposes the buffer text starting at recomposeStartIndex
852 * (which is in NFD - decomposed and canonically ordered),
853 * and truncates the buffer contents.
854 *
855 * Note that recomposition never lengthens the text:
856 * Any character consists of either one or two code units;
857 * a composition may contain at most one more code unit than the original starter,
858 * while the combining mark that is removed has at least one code unit.
859 */
// Parameters:
//   buffer              text is recomposed in place between
//                       buffer.getStart()+recomposeStartIndex and buffer.getLimit();
//                       at the end the (possibly lower) limit is stored back
//                       via buffer.setReorderingLimit()
//   recomposeStartIndex code unit offset from buffer.getStart() where to begin
//   onlyContiguous      if TRUE, a non-starter that does not combine clears
//                       the pending starter (see the FCC branch at the end of the loop)
recompose(ReorderingBuffer & buffer,int32_t recomposeStartIndex,UBool onlyContiguous) const860 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
861 UBool onlyContiguous) const {
862 UChar *p=buffer.getStart()+recomposeStartIndex;
863 UChar *limit=buffer.getLimit();
// Nothing to recompose in an empty range.
864 if(p==limit) {
865 return;
866 }
867
868 UChar *starter, *pRemove, *q, *r;
869 const uint16_t *compositionsList;
870 UChar32 c, compositeAndFwd;
871 uint16_t norm16;
872 uint8_t cc, prevCC;
873 UBool starterIsSupplementary;
874
875 // Some of the following variables are not used until we have a forward-combining starter
876 // and are only initialized now to avoid compiler warnings.
877 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter
878 starter=NULL;
879 starterIsSupplementary=FALSE;
880 prevCC=0;
881
// Each iteration reads one code point c with its norm16 at p and advances p past it.
// limit is a local copy that shrinks whenever code units are removed.
882 for(;;) {
883 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
884 cc=getCCFromYesOrMaybe(norm16);
885 if( // this character combines backward and
886 isMaybe(norm16) &&
887 // we have seen a starter that combines forward and
888 compositionsList!=NULL &&
889 // the backward-combining character is not blocked
890 (prevCC<cc || prevCC==0)
891 ) {
892 if(isJamoVT(norm16)) {
893 // c is a Jamo V/T, see if we can compose it with the previous character.
894 if(c<Hangul::JAMO_T_BASE) {
895 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
// prev is unsigned, so a starter below JAMO_L_BASE wraps to a large value and fails the range test.
896 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
897 if(prev<Hangul::JAMO_L_COUNT) {
898 pRemove=p-1;
899 UChar syllable=(UChar)
900 (Hangul::HANGUL_BASE+
901 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
902 Hangul::JAMO_T_COUNT);
903 UChar t;
904 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
905 ++p;
906 syllable+=t; // The next character was a Jamo T.
907 }
908 *starter=syllable;
909 // remove the Jamo V/T
// Close the gap [pRemove, p[ by moving the rest of the text down.
910 q=pRemove;
911 r=p;
912 while(r<limit) {
913 *q++=*r++;
914 }
915 limit=q;
916 p=pRemove;
917 }
918 }
919 /*
920 * No "else" for Jamo T:
921 * Since the input is in NFD, there are no Hangul LV syllables that
922 * a Jamo T could combine with.
923 * All Jamo Ts are combined above when handling Jamo Vs.
924 */
925 if(p==limit) {
926 break;
927 }
928 compositionsList=NULL;
929 continue;
930 } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
931 // The starter and the combining mark (c) do combine.
932 UChar32 composite=compositeAndFwd>>1;
933
934 // Replace the starter with the composite, remove the combining mark.
935 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark
936 if(starterIsSupplementary) {
937 if(U_IS_SUPPLEMENTARY(composite)) {
938 // both are supplementary
939 starter[0]=U16_LEAD(composite);
940 starter[1]=U16_TRAIL(composite);
941 } else {
942 *starter=(UChar)composite;
943 // The composite is shorter than the starter,
944 // move the intermediate characters forward one.
945 starterIsSupplementary=FALSE;
946 q=starter+1;
947 r=q+1;
948 while(r<pRemove) {
949 *q++=*r++;
950 }
951 --pRemove;
952 }
953 } else if(U_IS_SUPPLEMENTARY(composite)) {
954 // The composite is longer than the starter,
955 // move the intermediate characters back one.
956 starterIsSupplementary=TRUE;
957 ++starter; // temporarily increment for the loop boundary
958 q=pRemove;
959 r=++pRemove;
960 while(starter<q) {
961 *--r=*--q;
962 }
963 *starter=U16_TRAIL(composite);
964 *--starter=U16_LEAD(composite); // undo the temporary increment
965 } else {
966 // both are on the BMP
967 *starter=(UChar)composite;
968 }
969
970 /* remove the combining mark by moving the following text over it */
// pRemove==p is possible here: the starter grew by one unit into the space of a one-unit mark.
971 if(pRemove<p) {
972 q=pRemove;
973 r=p;
974 while(r<limit) {
975 *q++=*r++;
976 }
977 limit=q;
978 p=pRemove;
979 }
980 // Keep prevCC because we removed the combining mark.
981
982 if(p==limit) {
983 break;
984 }
985 // Is the composite a starter that combines forward?
986 if(compositeAndFwd&1) {
987 compositionsList=
988 getCompositionsListForComposite(getNorm16(composite));
989 } else {
990 compositionsList=NULL;
991 }
992
993 // We combined; continue with looking for compositions.
994 continue;
995 }
996 }
997
998 // no combination this time
999 prevCC=cc;
1000 if(p==limit) {
1001 break;
1002 }
1003
1004 // If c did not combine, then check if it is a starter.
1005 if(cc==0) {
1006 // Found a new starter.
1007 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
1008 // It may combine with something, prepare for it.
// p is already past c, so the starter begins one or two code units back.
1009 if(U_IS_BMP(c)) {
1010 starterIsSupplementary=FALSE;
1011 starter=p-1;
1012 } else {
1013 starterIsSupplementary=TRUE;
1014 starter=p-2;
1015 }
1016 }
1017 } else if(onlyContiguous) {
1018 // FCC: no discontiguous compositions; any intervening character blocks.
1019 compositionsList=NULL;
1020 }
1021 }
// Publish the shortened end of the text to the buffer.
1022 buffer.setReorderingLimit(limit);
1023 }
1024
1025 UChar32
composePair(UChar32 a,UChar32 b) const1026 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
1027 uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0
1028 const uint16_t *list;
1029 if(isInert(norm16)) {
1030 return U_SENTINEL;
1031 } else if(norm16<minYesNoMappingsOnly) {
1032 if(isJamoL(norm16)) {
1033 b-=Hangul::JAMO_V_BASE;
1034 if(0<=b && b<Hangul::JAMO_V_COUNT) {
1035 return
1036 (Hangul::HANGUL_BASE+
1037 ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
1038 Hangul::JAMO_T_COUNT);
1039 } else {
1040 return U_SENTINEL;
1041 }
1042 } else if(isHangul(norm16)) {
1043 b-=Hangul::JAMO_T_BASE;
1044 if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) { // not b==0!
1045 return a+b;
1046 } else {
1047 return U_SENTINEL;
1048 }
1049 } else {
1050 // 'a' has a compositions list in extraData
1051 list=extraData+norm16;
1052 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list
1053 list+= // mapping pointer
1054 1+ // +1 to skip the first unit with the mapping lenth
1055 (*list&MAPPING_LENGTH_MASK); // + mapping length
1056 }
1057 }
1058 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1059 return U_SENTINEL;
1060 } else {
1061 list=maybeYesCompositions+norm16-minMaybeYes;
1062 }
1063 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b
1064 return U_SENTINEL;
1065 }
1066 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1067 return combine(list, b)>>1;
1068 #else
1069 int32_t compositeAndFwd=combine(list, b);
1070 return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1071 #endif
1072 }
1073
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
// !doCompose: isNormalized (buffer must be empty and initialized)
//
// Walks [src, limit[ (limit==NULL: NUL-terminated input), fast-copying runs of
// characters that pass the "comp yes && ccc==0" quick check and handling every
// other character either directly (Hangul Jamo, combining marks in order) or
// by decomposing and recomposing the text between two composition boundaries.
//
// Returns FALSE if the low-prefix copy reported a failure, or if !doCompose and
// the text is found not to be normalized. Returns TRUE otherwise; note that the
// loop is also left with TRUE when a buffer operation returns FALSE, so callers
// presumably need to check errorCode as well.
UBool
Normalizer2Impl::compose(const UChar *src, const UChar *limit,
                         UBool onlyContiguous,
                         UBool doCompose,
                         ReorderingBuffer &buffer,
                         UErrorCode &errorCode) const {
    /*
     * prevBoundary points to the last character before the current one
     * that has a composition boundary before it with ccc==0 and quick check "yes".
     * Keeping track of prevBoundary saves us looking for a composition boundary
     * when we find a "no" or "maybe".
     *
     * When we back out from prevSrc back to prevBoundary,
     * then we also remove those same characters (which had been simply copied
     * or canonically-order-inserted) from the ReorderingBuffer.
     * Therefore, at all times, the [prevBoundary..prevSrc[ source units
     * must correspond 1:1 to destination units at the end of the destination buffer.
     */
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        // NUL-terminated input: handle the leading run of code units below
        // minNoMaybeCP first, then determine the real limit.
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
                                           doCompose ? &buffer : NULL,
                                           errorCode);
        if(U_FAILURE(errorCode)) {
            return FALSE;
        }
        if(prevBoundary<src) {
            // Set prevBoundary to the last character in the prefix.
            prevBoundary=src-1;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;

    // only for isNormalized
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src; src!=limit;) {
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // Surrogate code unit: assemble the supplementary code point
                // if it is properly paired, then look up its real norm16.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(doCompose) {
                if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
                    break;
                }
            } else {
                prevCC=0;
            }
            if(src==limit) {
                break;
            }
            // Set prevBoundary to the last character in the quick check loop.
            prevBoundary=src-1;
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
                U16_IS_LEAD(*(prevBoundary-1))
            ) {
                --prevBoundary;
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        /*
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
         * or has ccc!=0.
         * Check for Jamo V/T, then for regular characters.
         * c is not a Hangul syllable or Jamo L because those have "yes" properties.
         */
        if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
            UChar prev=*(prevSrc-1);
            UBool needToDecompose=FALSE;
            if(c<Hangul::JAMO_T_BASE) {
                // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                prev=(UChar)(prev-Hangul::JAMO_L_BASE);
                if(prev<Hangul::JAMO_L_COUNT) {
                    if(!doCompose) {
                        return FALSE;
                    }
                    UChar syllable=(UChar)
                        (Hangul::HANGUL_BASE+
                         (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
                         Hangul::JAMO_T_COUNT);
                    UChar t;
                    if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
                        ++src;
                        syllable+=t;  // The next character was a Jamo T.
                        prevBoundary=src;
                        // Overwrites the Jamo L that is currently the last buffer unit.
                        buffer.setLastChar(syllable);
                        continue;
                    }
                    // If we see L+V+x where x!=T then we drop to the slow path,
                    // decompose and recompose.
                    // This is to deal with NFKC finding normal L and V but a
                    // compatibility variant of a T. We need to either fully compose that
                    // combination here (which would complicate the code and may not work
                    // with strange custom data) or use the slow path -- or else our replacing
                    // two input characters (L+V) with one output character (LV syllable)
                    // would violate the invariant that [prevBoundary..prevSrc[ has the same
                    // length as what we appended to the buffer since prevBoundary.
                    needToDecompose=TRUE;
                }
            } else if(Hangul::isHangulWithoutJamoT(prev)) {
                // c is a Jamo Trailing consonant,
                // compose with previous Hangul LV that does not contain a Jamo T.
                if(!doCompose) {
                    return FALSE;
                }
                buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
                prevBoundary=src;
                continue;
            }
            if(!needToDecompose) {
                // The Jamo V/T did not compose into a Hangul syllable.
                if(doCompose) {
                    if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
                        break;
                    }
                } else {
                    prevCC=0;
                }
                continue;
            }
        }
        /*
         * Source buffer pointers:
         *
         *  all done      quick check   current char  not yet
         *                "yes" but     (c)           processed
         *                may combine
         *                forward
         * [-------------[-------------[-------------[-------------[
         * |             |             |             |             |
         * orig. src     prevBoundary  prevSrc       src           limit
         *
         *
         * Destination buffer pointers inside the ReorderingBuffer:
         *
         *  all done      might take    not filled yet
         *                characters for
         *                reordering
         * [-------------[-------------[-------------[
         * |             |             |             |
         * start         reorderStart  limit         |
         *                             +remainingCap.+
         */
        if(norm16>=MIN_YES_YES_WITH_CC) {
            uint8_t cc=(uint8_t)norm16;  // cc!=0
            if( onlyContiguous &&  // FCC
                (doCompose ? buffer.getLastCC() : prevCC)==0 &&
                prevBoundary<prevSrc &&
                // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                // passed the quick check "yes && ccc==0" test.
                // Check whether the last character was a "yesYes" or a "yesNo".
                // If a "yesNo", then we get its trailing ccc from its
                // mapping and check for canonical order.
                // All other cases are ok.
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
            ) {
                // Fails FCD test, need to decompose and contiguously recompose.
                if(!doCompose) {
                    return FALSE;
                }
            } else if(doCompose) {
                if(!buffer.append(c, cc, errorCode)) {
                    break;
                }
                continue;
            } else if(prevCC<=cc) {
                // isNormalized: combining marks are still in canonical order.
                prevCC=cc;
                continue;
            } else {
                return FALSE;
            }
        } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
            // isNormalized: c is neither "maybe" nor has ccc!=0, so it is a "no".
            return FALSE;
        }

        /*
         * Find appropriate boundaries around this character,
         * decompose the source text from between the boundaries,
         * and recompose it.
         *
         * We may need to remove the last few characters from the ReorderingBuffer
         * to account for source text that was copied or appended
         * but needs to take part in the recomposition.
         */

        /*
         * Find the last composition boundary in [prevBoundary..src[.
         * It is either the decomposition of the current character (at prevSrc),
         * or prevBoundary.
         */
        if(hasCompBoundaryBefore(c, norm16)) {
            prevBoundary=prevSrc;
        } else if(doCompose) {
            buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
        }

        // Find the next composition boundary in [src..limit[ -
        // modifies src to point to the next starter.
        src=(UChar *)findNextCompBoundary(src, limit);

        // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
        int32_t recomposeStartIndex=buffer.length();
        if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
            break;
        }
        recompose(buffer, recomposeStartIndex, onlyContiguous);
        if(!doCompose) {
            // isNormalized: the recomposed text must be identical to the source
            // segment; then empty the buffer again for the next segment.
            if(!buffer.equals(prevBoundary, src)) {
                return FALSE;
            }
            buffer.remove();
            prevCC=0;
        }

        // Move to the next starter. We never need to look back before this point again.
        prevBoundary=src;
    }
    return TRUE;
}
1333
// Very similar to compose(): Make the same changes in both places if relevant.
// pQCResult==NULL: spanQuickCheckYes
// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
//
// Scans [src, limit[ (limit==NULL: NUL-terminated input) without writing any
// output text.
// Returns src (==limit) when the whole input passes; otherwise returns
// prevBoundary, the start of the last character known to have a composition
// boundary before it. With pQCResult!=NULL, *pQCResult is set to UNORM_MAYBE
// (scanning continues) or UNORM_NO (scanning stops); it is never reset to
// UNORM_YES here.
const UChar *
Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
                                   UBool onlyContiguous,
                                   UNormalizationCheckResult *pQCResult) const {
    /*
     * prevBoundary points to the last character before the current one
     * that has a composition boundary before it with ccc==0 and quick check "yes".
     */
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        // No buffer is passed (NULL), so this call only skips the low prefix;
        // the local errorCode is not inspected afterwards.
        UErrorCode errorCode=U_ZERO_ERROR;
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
        if(prevBoundary<src) {
            // Set prevBoundary to the last character in the prefix.
            prevBoundary=src-1;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src;;) {
            if(src==limit) {
                return src;
            }
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // Surrogate code unit: assemble the supplementary code point
                // if it is properly paired, then look up its real norm16.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        if(src!=prevSrc) {
            // Set prevBoundary to the last character in the quick check loop.
            prevBoundary=src-1;
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
                U16_IS_LEAD(*(prevBoundary-1))
            ) {
                --prevBoundary;
            }
            prevCC=0;
            // The start of the current character (c).
            prevSrc=src;
        }

        src+=U16_LENGTH(c);
        /*
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
         * or has ccc!=0.
         */
        if(isMaybeOrNonZeroCC(norm16)) {
            uint8_t cc=getCCFromYesOrMaybe(norm16);
            if( onlyContiguous &&  // FCC
                cc!=0 &&
                prevCC==0 &&
                prevBoundary<prevSrc &&
                // prevCC==0 && prevBoundary<prevSrc tell us that
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                // passed the quick check "yes && ccc==0" test.
                // Check whether the last character was a "yesYes" or a "yesNo".
                // If a "yesNo", then we get its trailing ccc from its
                // mapping and check for canonical order.
                // All other cases are ok.
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
            ) {
                // Fails FCD test.
                // Falls through to the "no" result below.
            } else if(prevCC<=cc || cc==0) {
                prevCC=cc;
                if(norm16<MIN_YES_YES_WITH_CC) {
                    // c is a "maybe" character rather than a plain combining mark.
                    if(pQCResult!=NULL) {
                        *pQCResult=UNORM_MAYBE;
                    } else {
                        return prevBoundary;
                    }
                }
                continue;
            }
        }
        // "No": either a character with a mapping, or combining marks out of order.
        if(pQCResult!=NULL) {
            *pQCResult=UNORM_NO;
        }
        return prevBoundary;
    }
}
1446
composeAndAppend(const UChar * src,const UChar * limit,UBool doCompose,UBool onlyContiguous,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1447 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1448 UBool doCompose,
1449 UBool onlyContiguous,
1450 UnicodeString &safeMiddle,
1451 ReorderingBuffer &buffer,
1452 UErrorCode &errorCode) const {
1453 if(!buffer.isEmpty()) {
1454 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
1455 if(src!=firstStarterInSrc) {
1456 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1457 buffer.getLimit());
1458 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1459 UnicodeString middle(lastStarterInDest, destSuffixLength);
1460 buffer.removeSuffix(destSuffixLength);
1461 safeMiddle=middle;
1462 middle.append(src, (int32_t)(firstStarterInSrc-src));
1463 const UChar *middleStart=middle.getBuffer();
1464 compose(middleStart, middleStart+middle.length(), onlyContiguous,
1465 TRUE, buffer, errorCode);
1466 if(U_FAILURE(errorCode)) {
1467 return;
1468 }
1469 src=firstStarterInSrc;
1470 }
1471 }
1472 if(doCompose) {
1473 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1474 } else {
1475 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
1476 limit=u_strchr(src, 0);
1477 }
1478 buffer.appendZeroCC(src, limit, errorCode);
1479 }
1480 }
1481
1482 /**
1483 * Does c have a composition boundary before it?
1484 * True if its decomposition begins with a character that has
1485 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1486 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1487 * (isCompYesAndZeroCC()) so we need not decompose.
1488 */
hasCompBoundaryBefore(UChar32 c,uint16_t norm16) const1489 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
1490 for(;;) {
1491 if(isCompYesAndZeroCC(norm16)) {
1492 return TRUE;
1493 } else if(isMaybeOrNonZeroCC(norm16)) {
1494 return FALSE;
1495 } else if(isDecompNoAlgorithmic(norm16)) {
1496 c=mapAlgorithmic(c, norm16);
1497 norm16=getNorm16(c);
1498 } else {
1499 // c decomposes, get everything from the variable-length extra data
1500 const uint16_t *mapping=getMapping(norm16);
1501 uint16_t firstUnit=*mapping;
1502 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1503 return FALSE;
1504 }
1505 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
1506 return FALSE; // non-zero leadCC
1507 }
1508 int32_t i=1; // skip over the firstUnit
1509 UChar32 c;
1510 U16_NEXT_UNSAFE(mapping, i, c);
1511 return isCompYesAndZeroCC(getNorm16(c));
1512 }
1513 }
1514 }
1515
hasCompBoundaryAfter(UChar32 c,UBool onlyContiguous,UBool testInert) const1516 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
1517 for(;;) {
1518 uint16_t norm16=getNorm16(c);
1519 if(isInert(norm16)) {
1520 return TRUE;
1521 } else if(norm16<=minYesNo) {
1522 // Hangul: norm16==minYesNo
1523 // Hangul LVT has a boundary after it.
1524 // Hangul LV and non-inert yesYes characters combine forward.
1525 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
1526 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1527 return FALSE;
1528 } else if(isDecompNoAlgorithmic(norm16)) {
1529 c=mapAlgorithmic(c, norm16);
1530 } else {
1531 // c decomposes, get everything from the variable-length extra data.
1532 // If testInert, then c must be a yesNo character which has lccc=0,
1533 // otherwise it could be a noNo.
1534 const uint16_t *mapping=getMapping(norm16);
1535 uint16_t firstUnit=*mapping;
1536 // TRUE if
1537 // not MAPPING_NO_COMP_BOUNDARY_AFTER
1538 // (which is set if
1539 // c is not deleted, and
1540 // it and its decomposition do not combine forward, and it has a starter)
1541 // and if FCC then trailCC<=1
1542 return
1543 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
1544 (!onlyContiguous || firstUnit<=0x1ff);
1545 }
1546 }
1547 }
1548
/**
 * Iterates backward from p (not before start) and returns the start of the
 * first code point found that has a composition boundary before it.
 */
const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
    BackwardUTrie2StringIterator iter(normTrie, start, p);
    for(;;) {
        const uint16_t norm16=iter.previous16();
        if(hasCompBoundaryBefore(iter.codePoint, norm16)) {
            // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
            // but that's probably not worth the extra cost.
            return iter.codePointStart;
        }
    }
}
1559
/**
 * Iterates forward from p (not beyond limit) and returns the start of the
 * first code point found that has a composition boundary before it.
 */
const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
    ForwardUTrie2StringIterator iter(normTrie, p, limit);
    for(;;) {
        const uint16_t norm16=iter.next16();
        if(hasCompBoundaryBefore(iter.codePoint, norm16)) {
            return iter.codePointStart;
        }
    }
}
1568
1569 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
1570 // still had getFCDTrie() which built and cached an FCD trie.
1571 // That provided faster access to FCD data than getFCD16FromNormData()
1572 // but required synchronization and consumed some 10kB of heap memory
1573 // in any process that uses FCD (e.g., via collation).
1574 // tccc180[] and smallFCD[] are intended to help with any loss of performance,
1575 // at least for Latin & CJK.
1576
1577 // Gets the FCD value from the regular normalization data.
getFCD16FromNormData(UChar32 c) const1578 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
1579 // Only loops for 1:1 algorithmic mappings.
1580 for(;;) {
1581 uint16_t norm16=getNorm16(c);
1582 if(norm16<=minYesNo) {
1583 // no decomposition or Hangul syllable, all zeros
1584 return 0;
1585 } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
1586 // combining mark
1587 norm16&=0xff;
1588 return norm16|(norm16<<8);
1589 } else if(norm16>=minMaybeYes) {
1590 return 0;
1591 } else if(isDecompNoAlgorithmic(norm16)) {
1592 c=mapAlgorithmic(c, norm16);
1593 } else {
1594 // c decomposes, get everything from the variable-length extra data
1595 const uint16_t *mapping=getMapping(norm16);
1596 uint16_t firstUnit=*mapping;
1597 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1598 // A character that is deleted (maps to an empty string) must
1599 // get the worst-case lccc and tccc values because arbitrary
1600 // characters on both sides will become adjacent.
1601 return 0x1ff;
1602 } else {
1603 norm16=firstUnit>>8; // tccc
1604 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
1605 norm16|=*(mapping-1)&0xff00; // lccc
1606 }
1607 return norm16;
1608 }
1609 }
1610 }
1611 }
1612
// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
//
// Walks [src, limit[ (limit==NULL: NUL-terminated input) and checks that each
// character's lead ccc is not lower than the previous character's trail ccc.
// With a buffer, text in proper order is appended as is, and text around an
// out-of-order spot is decomposed (and thereby reordered) between FCD boundaries.
// Without a buffer, returns prevBoundary at the first out-of-order spot.
// Otherwise returns the src position reached when the loop ends, which is also
// where processing stopped if a buffer operation returned FALSE.
const UChar *
Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
                         ReorderingBuffer *buffer,
                         UErrorCode &errorCode) const {
    // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
    // Similar to the prevBoundary in the compose() implementation.
    const UChar *prevBoundary=src;
    // FCD value of the previous character; a negative value is the bitwise
    // complement of a below-MIN_CCC_LCCC_CP code unit whose lookup was deferred.
    int32_t prevFCD16=0;
    if(limit==NULL) {
        src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        if(prevBoundary<src) {
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            // Fetching the fcd16 value was deferred for this below-U+0300 code point.
            prevFCD16=getFCD16(*(src-1));
            if(prevFCD16>1) {
                --prevBoundary;
            }
        }
        limit=u_strchr(src, 0);
    }

    // Note: In this function we use buffer->appendZeroCC() because we track
    // the lead and trail combining classes here, rather than leaving it to
    // the ReorderingBuffer.
    // The exception is the call to decomposeShort() which uses the buffer
    // in the normal way.

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t fcd16=0;

    for(;;) {
        // count code units with lccc==0
        for(prevSrc=src; src!=limit;) {
            if((c=*src)<MIN_CCC_LCCC_CP) {
                // Defer the FCD lookup: remember the code unit as a negative value.
                prevFCD16=~c;
                ++src;
            } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                prevFCD16=0;
                ++src;
            } else {
                if(U16_IS_SURROGATE(c)) {
                    // Assemble the supplementary code point if properly paired.
                    UChar c2;
                    if(U16_IS_SURROGATE_LEAD(c)) {
                        if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                            c=U16_GET_SUPPLEMENTARY(c, c2);
                        }
                    } else /* trail surrogate */ {
                        if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                            --src;
                            c=U16_GET_SUPPLEMENTARY(c2, c);
                        }
                    }
                }
                // fcd16<=0xff means that the upper byte (lccc) is 0.
                if((fcd16=getFCD16FromNormData(c))<=0xff) {
                    prevFCD16=fcd16;
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
                break;
            }
            if(src==limit) {
                break;
            }
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            if(prevFCD16<0) {
                // Fetching the fcd16 value was deferred for this below-U+0300 code point.
                UChar32 prev=~prevFCD16;
                prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
                if(prevFCD16>1) {
                    --prevBoundary;
                }
            } else {
                const UChar *p=src-1;
                if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>1) {
                    prevBoundary=p;
                }
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
        // Check for proper order, and decompose locally if necessary.
        if((prevFCD16&0xff)<=(fcd16>>8)) {
            // proper order: prev tccc <= current lccc
            if((fcd16&0xff)<=1) {
                prevBoundary=src;
            }
            if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
                break;
            }
            prevFCD16=fcd16;
            continue;
        } else if(buffer==NULL) {
            return prevBoundary;  // quick check "no"
        } else {
            /*
             * Back out the part of the source that we copied or appended
             * already but is now going to be decomposed.
             * prevSrc is set to after what was copied/appended.
             */
            buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
            /*
             * Find the part of the source that needs to be decomposed,
             * up to the next safe boundary.
             */
            src=findNextFCDBoundary(src, limit);
            /*
             * The source text does not fulfill the conditions for FCD.
             * Decompose and reorder a limited piece of the text.
             */
            if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
                break;
            }
            prevBoundary=src;
            prevFCD16=0;
        }
    }
    return src;
}
1758
makeFCDAndAppend(const UChar * src,const UChar * limit,UBool doMakeFCD,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1759 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
1760 UBool doMakeFCD,
1761 UnicodeString &safeMiddle,
1762 ReorderingBuffer &buffer,
1763 UErrorCode &errorCode) const {
1764 if(!buffer.isEmpty()) {
1765 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
1766 if(src!=firstBoundaryInSrc) {
1767 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
1768 buffer.getLimit());
1769 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
1770 UnicodeString middle(lastBoundaryInDest, destSuffixLength);
1771 buffer.removeSuffix(destSuffixLength);
1772 safeMiddle=middle;
1773 middle.append(src, (int32_t)(firstBoundaryInSrc-src));
1774 const UChar *middleStart=middle.getBuffer();
1775 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
1776 if(U_FAILURE(errorCode)) {
1777 return;
1778 }
1779 src=firstBoundaryInSrc;
1780 }
1781 }
1782 if(doMakeFCD) {
1783 makeFCD(src, limit, &buffer, errorCode);
1784 } else {
1785 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
1786 limit=u_strchr(src, 0);
1787 }
1788 buffer.appendZeroCC(src, limit, errorCode);
1789 }
1790 }
1791
/**
 * Moves p backward, not before start, until it has stepped over a code point
 * whose FCD value is <=0xff (lccc==0), and returns the resulting position.
 */
const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
    for(;;) {
        if(p<=start) {
            break;  // reached the start of the text
        }
        // previousFCD16() moves p back by one code point.
        if(previousFCD16(start, p)<=0xff) {
            break;
        }
    }
    return p;
}
1796
/**
 * Returns the start of the first code point in [p, limit[ whose FCD value
 * is <=0xff (lccc==0), or the end position reached if there is none.
 */
const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
    const UChar *boundary=p;
    while(boundary<limit) {
        // nextFCD16() advances its first argument past one code point.
        const UChar *next=boundary;
        if(nextFCD16(next, limit)<=0xff) {
            break;
        }
        boundary=next;
    }
    return boundary;
}
1806
1807 // CanonicalIterator data -------------------------------------------------- ***
1808
// Constructs the data with a newly opened trie and an empty canonStartSets vector.
// Both initializers report failures via errorCode; the caller must check it.
// NOTE(review): uprv_deleteUObject is presumably the vector's element deleter,
// which would make the vector the owner of the sets added to it -- confirm in uvector.h.
CanonIterData::CanonIterData(UErrorCode &errorCode) :
        trie(utrie2_open(0, 0, &errorCode)),
        canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
1812
// Closes the trie that was opened in the constructor.
// canonStartSets is a member object and is destroyed automatically.
CanonIterData::~CanonIterData() {
    utrie2_close(trie);
}
1816
addToStartSet(UChar32 origin,UChar32 decompLead,UErrorCode & errorCode)1817 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
1818 uint32_t canonValue=utrie2_get32(trie, decompLead);
1819 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
1820 // origin is the first character whose decomposition starts with
1821 // the character for which we are setting the value.
1822 utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
1823 } else {
1824 // origin is not the first character, or it is U+0000.
1825 UnicodeSet *set;
1826 if((canonValue&CANON_HAS_SET)==0) {
1827 set=new UnicodeSet;
1828 if(set==NULL) {
1829 errorCode=U_MEMORY_ALLOCATION_ERROR;
1830 return;
1831 }
1832 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
1833 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
1834 utrie2_set32(trie, decompLead, canonValue, &errorCode);
1835 canonStartSets.addElement(set, errorCode);
1836 if(firstOrigin!=0) {
1837 set->add(firstOrigin);
1838 }
1839 } else {
1840 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
1841 }
1842 set->add(origin);
1843 }
1844 }
1845
1846 U_CDECL_BEGIN
1847
1848 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1849 // context: the Normalizer2Impl
1850 static UBool U_CALLCONV
enumCIDRangeHandler(const void * context,UChar32 start,UChar32 end,uint32_t value)1851 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1852 UErrorCode errorCode = U_ZERO_ERROR;
1853 if (value != 0) {
1854 Normalizer2Impl *impl = (Normalizer2Impl *)context;
1855 impl->makeCanonIterDataFromNorm16(
1856 start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);
1857 }
1858 return U_SUCCESS(errorCode);
1859 }
1860
1861
1862
1863 // UInitOnce instantiation function for CanonIterData
1864
1865 static void U_CALLCONV
initCanonIterData(Normalizer2Impl * impl,UErrorCode & errorCode)1866 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
1867 U_ASSERT(impl->fCanonIterData == NULL);
1868 impl->fCanonIterData = new CanonIterData(errorCode);
1869 if (impl->fCanonIterData == NULL) {
1870 errorCode=U_MEMORY_ALLOCATION_ERROR;
1871 }
1872 if (U_SUCCESS(errorCode)) {
1873 utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);
1874 utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
1875 }
1876 if (U_FAILURE(errorCode)) {
1877 delete impl->fCanonIterData;
1878 impl->fCanonIterData = NULL;
1879 }
1880 }
1881
1882 U_CDECL_END
1883
makeCanonIterDataFromNorm16(UChar32 start,UChar32 end,uint16_t norm16,CanonIterData & newData,UErrorCode & errorCode) const1884 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1885 CanonIterData &newData,
1886 UErrorCode &errorCode) const {
1887 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
1888 // Inert, or 2-way mapping (including Hangul syllable).
1889 // We do not write a canonStartSet for any yesNo character.
1890 // Composites from 2-way mappings are added at runtime from the
1891 // starter's compositions list, and the other characters in
1892 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
1893 // "maybe" characters.
1894 return;
1895 }
1896 for(UChar32 c=start; c<=end; ++c) {
1897 uint32_t oldValue=utrie2_get32(newData.trie, c);
1898 uint32_t newValue=oldValue;
1899 if(norm16>=minMaybeYes) {
1900 // not a segment starter if it occurs in a decomposition or has cc!=0
1901 newValue|=CANON_NOT_SEGMENT_STARTER;
1902 if(norm16<MIN_NORMAL_MAYBE_YES) {
1903 newValue|=CANON_HAS_COMPOSITIONS;
1904 }
1905 } else if(norm16<minYesNo) {
1906 newValue|=CANON_HAS_COMPOSITIONS;
1907 } else {
1908 // c has a one-way decomposition
1909 UChar32 c2=c;
1910 uint16_t norm16_2=norm16;
1911 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
1912 c2=mapAlgorithmic(c2, norm16_2);
1913 norm16_2=getNorm16(c2);
1914 }
1915 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
1916 // c decomposes, get everything from the variable-length extra data
1917 const uint16_t *mapping=getMapping(norm16_2);
1918 uint16_t firstUnit=*mapping;
1919 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
1920 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1921 if(c==c2 && (*(mapping-1)&0xff)!=0) {
1922 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
1923 }
1924 }
1925 // Skip empty mappings (no characters in the decomposition).
1926 if(length!=0) {
1927 ++mapping; // skip over the firstUnit
1928 // add c to first code point's start set
1929 int32_t i=0;
1930 U16_NEXT_UNSAFE(mapping, i, c2);
1931 newData.addToStartSet(c, c2, errorCode);
1932 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
1933 // one-way mapping. A 2-way mapping is possible here after
1934 // intermediate algorithmic mapping.
1935 if(norm16_2>=minNoNo) {
1936 while(i<length) {
1937 U16_NEXT_UNSAFE(mapping, i, c2);
1938 uint32_t c2Value=utrie2_get32(newData.trie, c2);
1939 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
1940 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
1941 &errorCode);
1942 }
1943 }
1944 }
1945 }
1946 } else {
1947 // c decomposed to c2 algorithmically; c has cc==0
1948 newData.addToStartSet(c, c2, errorCode);
1949 }
1950 }
1951 if(newValue!=oldValue) {
1952 utrie2_set32(newData.trie, c, newValue, &errorCode);
1953 }
1954 }
1955 }
1956
ensureCanonIterData(UErrorCode & errorCode) const1957 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
1958 // Logically const: Synchronized instantiation.
1959 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1960 umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
1961 return U_SUCCESS(errorCode);
1962 }
1963
getCanonValue(UChar32 c) const1964 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
1965 return (int32_t)utrie2_get32(fCanonIterData->trie, c);
1966 }
1967
getCanonStartSet(int32_t n) const1968 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
1969 return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
1970 }
1971
isCanonSegmentStarter(UChar32 c) const1972 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
1973 return getCanonValue(c)>=0;
1974 }
1975
getCanonStartSet(UChar32 c,UnicodeSet & set) const1976 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
1977 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
1978 if(canonValue==0) {
1979 return FALSE;
1980 }
1981 set.clear();
1982 int32_t value=canonValue&CANON_VALUE_MASK;
1983 if((canonValue&CANON_HAS_SET)!=0) {
1984 set.addAll(getCanonStartSet(value));
1985 } else if(value!=0) {
1986 set.add(value);
1987 }
1988 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
1989 uint16_t norm16=getNorm16(c);
1990 if(norm16==JAMO_L) {
1991 UChar32 syllable=
1992 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
1993 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
1994 } else {
1995 addComposites(getCompositionsList(norm16), set);
1996 }
1997 }
1998 return TRUE;
1999 }
2000
2001 U_NAMESPACE_END
2002
2003 // Normalizer2 data swapping ----------------------------------------------- ***
2004
2005 U_NAMESPACE_USE
2006
2007 U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)2008 unorm2_swap(const UDataSwapper *ds,
2009 const void *inData, int32_t length, void *outData,
2010 UErrorCode *pErrorCode) {
2011 const UDataInfo *pInfo;
2012 int32_t headerSize;
2013
2014 const uint8_t *inBytes;
2015 uint8_t *outBytes;
2016
2017 const int32_t *inIndexes;
2018 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
2019
2020 int32_t i, offset, nextOffset, size;
2021
2022 /* udata_swapDataHeader checks the arguments */
2023 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
2024 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
2025 return 0;
2026 }
2027
2028 /* check data format and format version */
2029 pInfo=(const UDataInfo *)((const char *)inData+4);
2030 if(!(
2031 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
2032 pInfo->dataFormat[1]==0x72 &&
2033 pInfo->dataFormat[2]==0x6d &&
2034 pInfo->dataFormat[3]==0x32 &&
2035 (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
2036 )) {
2037 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2038 pInfo->dataFormat[0], pInfo->dataFormat[1],
2039 pInfo->dataFormat[2], pInfo->dataFormat[3],
2040 pInfo->formatVersion[0]);
2041 *pErrorCode=U_UNSUPPORTED_ERROR;
2042 return 0;
2043 }
2044
2045 inBytes=(const uint8_t *)inData+headerSize;
2046 outBytes=(uint8_t *)outData+headerSize;
2047
2048 inIndexes=(const int32_t *)inBytes;
2049
2050 if(length>=0) {
2051 length-=headerSize;
2052 if(length<(int32_t)sizeof(indexes)) {
2053 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2054 length);
2055 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2056 return 0;
2057 }
2058 }
2059
2060 /* read the first few indexes */
2061 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
2062 indexes[i]=udata_readInt32(ds, inIndexes[i]);
2063 }
2064
2065 /* get the total length of the data */
2066 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2067
2068 if(length>=0) {
2069 if(length<size) {
2070 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2071 length);
2072 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2073 return 0;
2074 }
2075
2076 /* copy the data for inaccessible bytes */
2077 if(inBytes!=outBytes) {
2078 uprv_memcpy(outBytes, inBytes, size);
2079 }
2080
2081 offset=0;
2082
2083 /* swap the int32_t indexes[] */
2084 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2085 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2086 offset=nextOffset;
2087
2088 /* swap the UTrie2 */
2089 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2090 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2091 offset=nextOffset;
2092
2093 /* swap the uint16_t extraData[] */
2094 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2095 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2096 offset=nextOffset;
2097
2098 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2099 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2100 offset=nextOffset;
2101
2102 U_ASSERT(offset==size);
2103 }
2104
2105 return headerSize+size;
2106 }
2107
2108 #endif // !UCONFIG_NO_NORMALIZATION
2109