1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: normalizer2impl.h 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov22 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __NORMALIZER2IMPL_H__ 20 #define __NORMALIZER2IMPL_H__ 21 22 #include "unicode/utypes.h" 23 24 #if !UCONFIG_NO_NORMALIZATION 25 26 #include "unicode/normalizer2.h" 27 #include "unicode/unistr.h" 28 #include "unicode/unorm.h" 29 #include "unicode/utf16.h" 30 #include "mutex.h" 31 #include "uset_imp.h" 32 #include "utrie2.h" 33 34 U_NAMESPACE_BEGIN 35 36 struct CanonIterData; 37 38 class U_COMMON_API Hangul { 39 public: 40 /* Korean Hangul and Jamo constants */ 41 enum { 42 JAMO_L_BASE=0x1100, /* "lead" jamo */ 43 JAMO_L_END=0x1112, 44 JAMO_V_BASE=0x1161, /* "vowel" jamo */ 45 JAMO_V_END=0x1175, 46 JAMO_T_BASE=0x11a7, /* "trail" jamo */ 47 JAMO_T_END=0x11c2, 48 49 HANGUL_BASE=0xac00, 50 HANGUL_END=0xd7a3, 51 52 JAMO_L_COUNT=19, 53 JAMO_V_COUNT=21, 54 JAMO_T_COUNT=28, 55 56 JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, 57 58 HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, 59 HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT 60 }; 61 isHangul(UChar32 c)62 static inline UBool isHangul(UChar32 c) { 63 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 64 } 65 static inline UBool isHangulWithoutJamoT(UChar c)66 isHangulWithoutJamoT(UChar c) { 67 c-=HANGUL_BASE; 68 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 69 } isJamoL(UChar32 c)70 static inline UBool isJamoL(UChar32 c) { 71 return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT; 72 } isJamoV(UChar32 c)73 static inline UBool isJamoV(UChar32 c) { 74 return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT; 75 } 76 77 /** 78 * Decomposes c, which must be a Hangul syllable, into buffer 79 * and returns the length of the decomposition (2 or 3). 80 */ decompose(UChar32 c,UChar buffer[3])81 static inline int32_t decompose(UChar32 c, UChar buffer[3]) { 82 c-=HANGUL_BASE; 83 UChar32 c2=c%JAMO_T_COUNT; 84 c/=JAMO_T_COUNT; 85 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); 86 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); 87 if(c2==0) { 88 return 2; 89 } else { 90 buffer[2]=(UChar)(JAMO_T_BASE+c2); 91 return 3; 92 } 93 } 94 95 /** 96 * Decomposes c, which must be a Hangul syllable, into buffer. 97 * This is the raw, not recursive, decomposition. Its length is always 2. 98 */ getRawDecomposition(UChar32 c,UChar buffer[2])99 static inline void getRawDecomposition(UChar32 c, UChar buffer[2]) { 100 UChar32 orig=c; 101 c-=HANGUL_BASE; 102 UChar32 c2=c%JAMO_T_COUNT; 103 if(c2==0) { 104 c/=JAMO_T_COUNT; 105 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); 106 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); 107 } else { 108 buffer[0]=orig-c2; // LV syllable 109 buffer[1]=(UChar)(JAMO_T_BASE+c2); 110 } 111 } 112 private: 113 Hangul(); // no instantiation 114 }; 115 116 class Normalizer2Impl; 117 118 class U_COMMON_API ReorderingBuffer : public UMemory { 119 public: ReorderingBuffer(const Normalizer2Impl & ni,UnicodeString & dest)120 ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) : 121 impl(ni), str(dest), 122 start(NULL), reorderStart(NULL), limit(NULL), 123 remainingCapacity(0), lastCC(0) {} ~ReorderingBuffer()124 ~ReorderingBuffer() { 125 if(start!=NULL) { 126 str.releaseBuffer((int32_t)(limit-start)); 127 } 128 } 129 UBool init(int32_t destCapacity, UErrorCode &errorCode); 130 isEmpty()131 UBool isEmpty() const { return start==limit; } length()132 int32_t length() const { return (int32_t)(limit-start); } getStart()133 UChar *getStart() { return start; } getLimit()134 UChar *getLimit() { return limit; } getLastCC()135 uint8_t getLastCC() const { return lastCC; } 136 137 UBool equals(const UChar *start, const UChar *limit) const; 138 139 // For Hangul composition, replacing the Leading consonant Jamo with the syllable. setLastChar(UChar c)140 void setLastChar(UChar c) { 141 *(limit-1)=c; 142 } 143 append(UChar32 c,uint8_t cc,UErrorCode & errorCode)144 UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 145 return (c<=0xffff) ? 146 appendBMP((UChar)c, cc, errorCode) : 147 appendSupplementary(c, cc, errorCode); 148 } 149 // s must be in NFD, otherwise change the implementation. 150 UBool append(const UChar *s, int32_t length, 151 uint8_t leadCC, uint8_t trailCC, 152 UErrorCode &errorCode); appendBMP(UChar c,uint8_t cc,UErrorCode & errorCode)153 UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) { 154 if(remainingCapacity==0 && !resize(1, errorCode)) { 155 return FALSE; 156 } 157 if(lastCC<=cc || cc==0) { 158 *limit++=c; 159 lastCC=cc; 160 if(cc<=1) { 161 reorderStart=limit; 162 } 163 } else { 164 insert(c, cc); 165 } 166 --remainingCapacity; 167 return TRUE; 168 } 169 UBool appendZeroCC(UChar32 c, UErrorCode &errorCode); 170 UBool appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode); 171 void remove(); 172 void removeSuffix(int32_t suffixLength); setReorderingLimit(UChar * newLimit)173 void setReorderingLimit(UChar *newLimit) { 174 remainingCapacity+=(int32_t)(limit-newLimit); 175 reorderStart=limit=newLimit; 176 lastCC=0; 177 } copyReorderableSuffixTo(UnicodeString & s)178 void copyReorderableSuffixTo(UnicodeString &s) const { 179 s.setTo(reorderStart, (int32_t)(limit-reorderStart)); 180 } 181 private: 182 /* 183 * TODO: Revisit whether it makes sense to track reorderStart. 184 * It is set to after the last known character with cc<=1, 185 * which stops previousCC() before it reads that character and looks up its cc. 186 * previousCC() is normally only called from insert(). 187 * In other words, reorderStart speeds up the insertion of a combining mark 188 * into a multi-combining mark sequence where it does not belong at the end. 189 * This might not be worth the trouble. 190 * On the other hand, it's not a huge amount of trouble. 191 * 192 * We probably need it for UNORM_SIMPLE_APPEND. 193 */ 194 195 UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode); 196 void insert(UChar32 c, uint8_t cc); writeCodePoint(UChar * p,UChar32 c)197 static void writeCodePoint(UChar *p, UChar32 c) { 198 if(c<=0xffff) { 199 *p=(UChar)c; 200 } else { 201 p[0]=U16_LEAD(c); 202 p[1]=U16_TRAIL(c); 203 } 204 } 205 UBool resize(int32_t appendLength, UErrorCode &errorCode); 206 207 const Normalizer2Impl &impl; 208 UnicodeString &str; 209 UChar *start, *reorderStart, *limit; 210 int32_t remainingCapacity; 211 uint8_t lastCC; 212 213 // private backward iterator setIterator()214 void setIterator() { codePointStart=limit; } 215 void skipPrevious(); // Requires start<codePointStart. 216 uint8_t previousCC(); // Returns 0 if there is no previous character. 217 218 UChar *codePointStart, *codePointLimit; 219 }; 220 221 class U_COMMON_API Normalizer2Impl : public UObject { 222 public: Normalizer2Impl()223 Normalizer2Impl() : normTrie(NULL), fCanonIterData(NULL) { 224 fCanonIterDataInitOnce.reset(); 225 } 226 virtual ~Normalizer2Impl(); 227 228 void init(const int32_t *inIndexes, const UTrie2 *inTrie, 229 const uint16_t *inExtraData, const uint8_t *inSmallFCD); 230 231 void addLcccChars(UnicodeSet &set) const; 232 void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; 233 void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; 234 235 // low-level properties ------------------------------------------------ *** 236 getNormTrie()237 const UTrie2 *getNormTrie() const { return normTrie; } 238 239 UBool ensureCanonIterData(UErrorCode &errorCode) const; 240 getNorm16(UChar32 c)241 uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); } 242 getCompQuickCheck(uint16_t norm16)243 UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const { 244 if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { 245 return UNORM_YES; 246 } else if(minMaybeYes<=norm16) { 247 return UNORM_MAYBE; 248 } else { 249 return UNORM_NO; 250 } 251 } isAlgorithmicNoNo(uint16_t norm16)252 UBool isAlgorithmicNoNo(uint16_t norm16) const { return limitNoNo<=norm16 && norm16<minMaybeYes; } isCompNo(uint16_t norm16)253 UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; } isDecompYes(uint16_t norm16)254 UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; } 255 getCC(uint16_t norm16)256 uint8_t getCC(uint16_t norm16) const { 257 if(norm16>=MIN_NORMAL_MAYBE_YES) { 258 return (uint8_t)norm16; 259 } 260 if(norm16<minNoNo || limitNoNo<=norm16) { 261 return 0; 262 } 263 return getCCFromNoNo(norm16); 264 } getCCFromYesOrMaybe(uint16_t norm16)265 static uint8_t getCCFromYesOrMaybe(uint16_t norm16) { 266 return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0; 267 } 268 269 /** 270 * Returns the FCD data for code point c. 271 * @param c A Unicode code point. 272 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 273 */ getFCD16(UChar32 c)274 uint16_t getFCD16(UChar32 c) const { 275 if(c<0) { 276 return 0; 277 } else if(c<0x180) { 278 return tccc180[c]; 279 } else if(c<=0xffff) { 280 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 281 } 282 return getFCD16FromNormData(c); 283 } 284 /** 285 * Returns the FCD data for the next code point (post-increment). 286 * Might skip only a lead surrogate rather than the whole surrogate pair if none of 287 * the supplementary code points associated with the lead surrogate have non-zero FCD data. 288 * @param s A valid pointer into a string. Requires s!=limit. 289 * @param limit The end of the string, or NULL. 290 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 291 */ nextFCD16(const UChar * & s,const UChar * limit)292 uint16_t nextFCD16(const UChar *&s, const UChar *limit) const { 293 UChar32 c=*s++; 294 if(c<0x180) { 295 return tccc180[c]; 296 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 297 return 0; 298 } 299 UChar c2; 300 if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) { 301 c=U16_GET_SUPPLEMENTARY(c, c2); 302 ++s; 303 } 304 return getFCD16FromNormData(c); 305 } 306 /** 307 * Returns the FCD data for the previous code point (pre-decrement). 308 * @param start The start of the string. 309 * @param s A valid pointer into a string. Requires start<s. 310 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 311 */ previousFCD16(const UChar * start,const UChar * & s)312 uint16_t previousFCD16(const UChar *start, const UChar *&s) const { 313 UChar32 c=*--s; 314 if(c<0x180) { 315 return tccc180[c]; 316 } 317 if(!U16_IS_TRAIL(c)) { 318 if(!singleLeadMightHaveNonZeroFCD16(c)) { 319 return 0; 320 } 321 } else { 322 UChar c2; 323 if(start<s && U16_IS_LEAD(c2=*(s-1))) { 324 c=U16_GET_SUPPLEMENTARY(c2, c); 325 --s; 326 } 327 } 328 return getFCD16FromNormData(c); 329 } 330 331 /** Returns the FCD data for U+0000<=c<U+0180. */ getFCD16FromBelow180(UChar32 c)332 uint16_t getFCD16FromBelow180(UChar32 c) const { return tccc180[c]; } 333 /** Returns TRUE if the single-or-lead code unit c might have non-zero FCD data. */ singleLeadMightHaveNonZeroFCD16(UChar32 lead)334 UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const { 335 // 0<=lead<=0xffff 336 uint8_t bits=smallFCD[lead>>8]; 337 if(bits==0) { return false; } 338 return (UBool)((bits>>((lead>>5)&7))&1); 339 } 340 /** Returns the FCD value from the regular normalization data. */ 341 uint16_t getFCD16FromNormData(UChar32 c) const; 342 343 void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 344 CanonIterData &newData, UErrorCode &errorCode) const; 345 346 /** 347 * Gets the decomposition for one code point. 348 * @param c code point 349 * @param buffer out-only buffer for algorithmic decompositions 350 * @param length out-only, takes the length of the decomposition, if any 351 * @return pointer to the decomposition, or NULL if none 352 */ 353 const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const; 354 355 /** 356 * Gets the raw decomposition for one code point. 357 * @param c code point 358 * @param buffer out-only buffer for algorithmic decompositions 359 * @param length out-only, takes the length of the decomposition, if any 360 * @return pointer to the decomposition, or NULL if none 361 */ 362 const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const; 363 364 UChar32 composePair(UChar32 a, UChar32 b) const; 365 366 UBool isCanonSegmentStarter(UChar32 c) const; 367 UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const; 368 369 enum { 370 MIN_CCC_LCCC_CP=0x300 371 }; 372 373 enum { 374 MIN_YES_YES_WITH_CC=0xff01, 375 JAMO_VT=0xff00, 376 MIN_NORMAL_MAYBE_YES=0xfe00, 377 JAMO_L=1, 378 MAX_DELTA=0x40 379 }; 380 381 enum { 382 // Byte offsets from the start of the data, after the generic header. 383 IX_NORM_TRIE_OFFSET, 384 IX_EXTRA_DATA_OFFSET, 385 IX_SMALL_FCD_OFFSET, 386 IX_RESERVED3_OFFSET, 387 IX_RESERVED4_OFFSET, 388 IX_RESERVED5_OFFSET, 389 IX_RESERVED6_OFFSET, 390 IX_TOTAL_SIZE, 391 392 // Code point thresholds for quick check codes. 393 IX_MIN_DECOMP_NO_CP, 394 IX_MIN_COMP_NO_MAYBE_CP, 395 396 // Norm16 value thresholds for quick check combinations and types of extra data. 397 IX_MIN_YES_NO, // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. 398 IX_MIN_NO_NO, 399 IX_LIMIT_NO_NO, 400 IX_MIN_MAYBE_YES, 401 402 IX_MIN_YES_NO_MAPPINGS_ONLY, // Mappings only in [minYesNoMappingsOnly..minNoNo[. 403 404 IX_RESERVED15, 405 IX_COUNT 406 }; 407 408 enum { 409 MAPPING_HAS_CCC_LCCC_WORD=0x80, 410 MAPPING_HAS_RAW_MAPPING=0x40, 411 MAPPING_NO_COMP_BOUNDARY_AFTER=0x20, 412 MAPPING_LENGTH_MASK=0x1f 413 }; 414 415 enum { 416 COMP_1_LAST_TUPLE=0x8000, 417 COMP_1_TRIPLE=1, 418 COMP_1_TRAIL_LIMIT=0x3400, 419 COMP_1_TRAIL_MASK=0x7ffe, 420 COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit 421 COMP_2_TRAIL_SHIFT=6, 422 COMP_2_TRAIL_MASK=0xffc0 423 }; 424 425 // higher-level functionality ------------------------------------------ *** 426 427 // NFD without an NFD Normalizer2 instance. 428 UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest, 429 UErrorCode &errorCode) const; 430 /** 431 * Decomposes [src, limit[ and writes the result to dest. 432 * limit can be NULL if src is NUL-terminated. 433 * destLengthEstimate is the initial dest buffer capacity and can be -1. 434 */ 435 void decompose(const UChar *src, const UChar *limit, 436 UnicodeString &dest, int32_t destLengthEstimate, 437 UErrorCode &errorCode) const; 438 439 const UChar *decompose(const UChar *src, const UChar *limit, 440 ReorderingBuffer *buffer, UErrorCode &errorCode) const; 441 void decomposeAndAppend(const UChar *src, const UChar *limit, 442 UBool doDecompose, 443 UnicodeString &safeMiddle, 444 ReorderingBuffer &buffer, 445 UErrorCode &errorCode) const; 446 UBool compose(const UChar *src, const UChar *limit, 447 UBool onlyContiguous, 448 UBool doCompose, 449 ReorderingBuffer &buffer, 450 UErrorCode &errorCode) const; 451 const UChar *composeQuickCheck(const UChar *src, const UChar *limit, 452 UBool onlyContiguous, 453 UNormalizationCheckResult *pQCResult) const; 454 void composeAndAppend(const UChar *src, const UChar *limit, 455 UBool doCompose, 456 UBool onlyContiguous, 457 UnicodeString &safeMiddle, 458 ReorderingBuffer &buffer, 459 UErrorCode &errorCode) const; 460 const UChar *makeFCD(const UChar *src, const UChar *limit, 461 ReorderingBuffer *buffer, UErrorCode &errorCode) const; 462 void makeFCDAndAppend(const UChar *src, const UChar *limit, 463 UBool doMakeFCD, 464 UnicodeString &safeMiddle, 465 ReorderingBuffer &buffer, 466 UErrorCode &errorCode) const; 467 468 UBool hasDecompBoundary(UChar32 c, UBool before) const; isDecompInert(UChar32 c)469 UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); } 470 hasCompBoundaryBefore(UChar32 c)471 UBool hasCompBoundaryBefore(UChar32 c) const { 472 return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); 473 } 474 UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const; 475 hasFCDBoundaryBefore(UChar32 c)476 UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; } hasFCDBoundaryAfter(UChar32 c)477 UBool hasFCDBoundaryAfter(UChar32 c) const { 478 uint16_t fcd16=getFCD16(c); 479 return fcd16<=1 || (fcd16&0xff)==0; 480 } isFCDInert(UChar32 c)481 UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; } 482 private: isMaybe(uint16_t norm16)483 UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } isMaybeOrNonZeroCC(uint16_t norm16)484 UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; } isInert(uint16_t norm16)485 static UBool isInert(uint16_t norm16) { return norm16==0; } isJamoL(uint16_t norm16)486 static UBool isJamoL(uint16_t norm16) { return norm16==1; } isJamoVT(uint16_t norm16)487 static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } isHangul(uint16_t norm16)488 UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; } isCompYesAndZeroCC(uint16_t norm16)489 UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; } 490 // UBool isCompYes(uint16_t norm16) const { 491 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 492 // } 493 // UBool isCompYesOrMaybe(uint16_t norm16) const { 494 // return norm16<minNoNo || minMaybeYes<=norm16; 495 // } 496 // UBool hasZeroCCFromDecompYes(uint16_t norm16) const { 497 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 498 // } isDecompYesAndZeroCC(uint16_t norm16)499 UBool isDecompYesAndZeroCC(uint16_t norm16) const { 500 return norm16<minYesNo || 501 norm16==JAMO_VT || 502 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 503 } 504 /** 505 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 506 * the MaybeYes which combine-forward and have ccc=0. 507 * (Standard Unicode 5.2 normalization does not have such characters.) 508 */ isMostDecompYesAndZeroCC(uint16_t norm16)509 UBool isMostDecompYesAndZeroCC(uint16_t norm16) const { 510 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 511 } isDecompNoAlgorithmic(uint16_t norm16)512 UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; } 513 514 // For use with isCompYes(). 515 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 516 // static uint8_t getCCFromYes(uint16_t norm16) { 517 // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; 518 // } getCCFromNoNo(uint16_t norm16)519 uint8_t getCCFromNoNo(uint16_t norm16) const { 520 const uint16_t *mapping=getMapping(norm16); 521 if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { 522 return (uint8_t)*(mapping-1); 523 } else { 524 return 0; 525 } 526 } 527 // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() 528 uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const; 529 530 // Requires algorithmic-NoNo. mapAlgorithmic(UChar32 c,uint16_t norm16)531 UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { 532 return c+norm16-(minMaybeYes-MAX_DELTA-1); 533 } 534 535 // Requires minYesNo<norm16<limitNoNo. getMapping(uint16_t norm16)536 const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; } getCompositionsListForDecompYes(uint16_t norm16)537 const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const { 538 if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { 539 return NULL; 540 } else if(norm16<minMaybeYes) { 541 return extraData+norm16; // for yesYes; if Jamo L: harmless empty list 542 } else { 543 return maybeYesCompositions+norm16-minMaybeYes; 544 } 545 } getCompositionsListForComposite(uint16_t norm16)546 const uint16_t *getCompositionsListForComposite(uint16_t norm16) const { 547 const uint16_t *list=extraData+norm16; // composite has both mapping & compositions list 548 return list+ // mapping pointer 549 1+ // +1 to skip the first unit with the mapping lenth 550 (*list&MAPPING_LENGTH_MASK); // + mapping length 551 } 552 /** 553 * @param c code point must have compositions 554 * @return compositions list pointer 555 */ getCompositionsList(uint16_t norm16)556 const uint16_t *getCompositionsList(uint16_t norm16) const { 557 return isDecompYes(norm16) ? 558 getCompositionsListForDecompYes(norm16) : 559 getCompositionsListForComposite(norm16); 560 } 561 562 const UChar *copyLowPrefixFromNulTerminated(const UChar *src, 563 UChar32 minNeedDataCP, 564 ReorderingBuffer *buffer, 565 UErrorCode &errorCode) const; 566 UBool decomposeShort(const UChar *src, const UChar *limit, 567 ReorderingBuffer &buffer, UErrorCode &errorCode) const; 568 UBool decompose(UChar32 c, uint16_t norm16, 569 ReorderingBuffer &buffer, UErrorCode &errorCode) const; 570 571 static int32_t combine(const uint16_t *list, UChar32 trail); 572 void addComposites(const uint16_t *list, UnicodeSet &set) const; 573 void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, 574 UBool onlyContiguous) const; 575 576 UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const; 577 const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const; 578 const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const; 579 580 const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const; 581 const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const; 582 583 int32_t getCanonValue(UChar32 c) const; 584 const UnicodeSet &getCanonStartSet(int32_t n) const; 585 586 // UVersionInfo dataVersion; 587 588 // Code point thresholds for quick check codes. 589 UChar32 minDecompNoCP; 590 UChar32 minCompNoMaybeCP; 591 592 // Norm16 value thresholds for quick check combinations and types of extra data. 593 uint16_t minYesNo; 594 uint16_t minYesNoMappingsOnly; 595 uint16_t minNoNo; 596 uint16_t limitNoNo; 597 uint16_t minMaybeYes; 598 599 const UTrie2 *normTrie; 600 const uint16_t *maybeYesCompositions; 601 const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 602 const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 603 uint8_t tccc180[0x180]; // tccc values for U+0000..U+017F 604 605 public: // CanonIterData is public to allow access from C callback functions. 606 UInitOnce fCanonIterDataInitOnce; 607 CanonIterData *fCanonIterData; 608 }; 609 610 // bits in canonIterData 611 #define CANON_NOT_SEGMENT_STARTER 0x80000000 612 #define CANON_HAS_COMPOSITIONS 0x40000000 613 #define CANON_HAS_SET 0x200000 614 #define CANON_VALUE_MASK 0x1fffff 615 616 /** 617 * ICU-internal shortcut for quick access to standard Unicode normalization. 618 */ 619 class U_COMMON_API Normalizer2Factory { 620 public: 621 static const Normalizer2 *getFCDInstance(UErrorCode &errorCode); 622 static const Normalizer2 *getFCCInstance(UErrorCode &errorCode); 623 static const Normalizer2 *getNoopInstance(UErrorCode &errorCode); 624 625 static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode); 626 627 static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode); 628 static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode); 629 static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode); 630 631 // Get the Impl instance of the Normalizer2. 632 // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance. 633 static const Normalizer2Impl *getImpl(const Normalizer2 *norm2); 634 private: 635 Normalizer2Factory(); // No instantiation. 636 }; 637 638 U_NAMESPACE_END 639 640 U_CAPI int32_t U_EXPORT2 641 unorm2_swap(const UDataSwapper *ds, 642 const void *inData, int32_t length, void *outData, 643 UErrorCode *pErrorCode); 644 645 /** 646 * Get the NF*_QC property for a code point, for u_getIntPropertyValue(). 647 * @internal 648 */ 649 U_CFUNC UNormalizationCheckResult 650 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode); 651 652 /** 653 * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue(). 654 * @internal 655 */ 656 U_CFUNC uint16_t 657 unorm_getFCD16(UChar32 c); 658 659 /** 660 * Format of Normalizer2 .nrm data files. 661 * Format version 2.0. 662 * 663 * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms. 664 * ICU ships with data files for standard Unicode Normalization Forms 665 * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm). 666 * Custom (application-specific) data can be built into additional .nrm files 667 * with the gennorm2 build tool. 668 * 669 * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been 670 * cached already. Internally, Normalizer2Impl.load() reads the .nrm file. 671 * 672 * A .nrm file begins with a standard ICU data file header 673 * (DataHeader, see ucmndata.h and unicode/udata.h). 674 * The UDataInfo.dataVersion field usually contains the Unicode version 675 * for which the data was generated. 676 * 677 * After the header, the file contains the following parts. 678 * Constants are defined as enum values of the Normalizer2Impl class. 679 * 680 * Many details of the data structures are described in the design doc 681 * which is at http://site.icu-project.org/design/normalization/custom 682 * 683 * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4; 684 * 685 * The first eight indexes are byte offsets in ascending order. 686 * Each byte offset marks the start of the next part in the data file, 687 * and the end of the previous one. 688 * When two consecutive byte offsets are the same, then the corresponding part is empty. 689 * Byte offsets are offsets from after the header, 690 * that is, from the beginning of the indexes[]. 691 * Each part starts at an offset with proper alignment for its data. 692 * If necessary, the previous part may include padding bytes to achieve this alignment. 693 * 694 * minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point 695 * with a decomposition mapping, that is, with NF*D_QC=No. 696 * minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point 697 * with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward). 698 * 699 * The next five indexes are thresholds of 16-bit trie values for ranges of 700 * values indicating multiple normalization properties. 701 * minYesNo=indexes[IX_MIN_YES_NO]; 702 * minNoNo=indexes[IX_MIN_NO_NO]; 703 * limitNoNo=indexes[IX_LIMIT_NO_NO]; 704 * minMaybeYes=indexes[IX_MIN_MAYBE_YES]; 705 * minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 706 * See the normTrie description below and the design doc for details. 707 * 708 * UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h 709 * 710 * The trie holds the main normalization data. Each code point is mapped to a 16-bit value. 711 * Rather than using independent bits in the value (which would require more than 16 bits), 712 * information is extracted primarily via range checks. 713 * For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo 714 * means that the character has NF*C_QC=Yes and NF*D_QC=No properties, 715 * which means it has a two-way (round-trip) decomposition mapping. 716 * Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData 717 * pointing to mappings, compositions lists, or both. 718 * Value norm16==0 means that the character is normalization-inert, that is, 719 * it does not have a mapping, does not participate in composition, has a zero 720 * canonical combining class, and forms a boundary where text before it and after it 721 * can be normalized independently. 722 * For details about how multiple properties are encoded in 16-bit values 723 * see the design doc. 724 * Note that the encoding cannot express all combinations of the properties involved; 725 * it only supports those combinations that are allowed by 726 * the Unicode Normalization algorithms. Details are in the design doc as well. 727 * The gennorm2 tool only builds .nrm files for data that conforms to the limitations. 728 * 729 * The trie has a value for each lead surrogate code unit representing the "worst case" 730 * properties of the 1024 supplementary characters whose UTF-16 form starts with 731 * the lead surrogate. If all of the 1024 supplementary characters are normalization-inert, 732 * then their lead surrogate code unit has the trie value 0. 733 * When the lead surrogate unit's value exceeds the quick check minimum during processing, 734 * the properties for the full supplementary code point need to be looked up. 735 * 736 * uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]; 737 * uint16_t extraData[]; 738 * 739 * There is only one byte offset for the end of these two arrays. 740 * The split between them is given by the constant and variable mentioned above. 741 * 742 * The maybeYesCompositions array contains compositions lists for characters that 743 * combine both forward (as starters in composition pairs) 744 * and backward (as trailing characters in composition pairs). 745 * Such characters do not occur in Unicode 5.2 but are allowed by 746 * the Unicode Normalization algorithms. 747 * If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES 748 * and the maybeYesCompositions array is empty. 749 * If there are such characters, then minMaybeYes is subtracted from their norm16 values 750 * to get the index into this array. 751 * 752 * The extraData array contains compositions lists for "YesYes" characters, 753 * followed by mappings and optional compositions lists for "YesNo" characters, 754 * followed by only mappings for "NoNo" characters. 755 * (Referring to pairs of NFC/NFD quick check values.) 756 * The norm16 values of those characters are directly indexes into the extraData array. 757 * 758 * The data structures for compositions lists and mappings are described in the design doc. 759 * 760 * uint8_t smallFCD[0x100]; -- new in format version 2 761 * 762 * This is a bit set to help speed up FCD value lookups in the absence of a full 763 * UTrie2 or other large data structure with the full FCD value mapping. 764 * 765 * Each smallFCD bit is set if any of the corresponding 32 BMP code points 766 * has a non-zero FCD value (lccc!=0 or tccc!=0). 767 * Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF. 768 * A bit for 32 lead surrogates is set if any of the 32k corresponding 769 * _supplementary_ code points has a non-zero FCD value. 770 * 771 * This bit set is most useful for the large blocks of CJK characters with FCD=0. 772 * 773 * Changes from format version 1 to format version 2 --------------------------- 774 * 775 * - Addition of data for raw (not recursively decomposed) mappings. 776 * + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when 777 * the mapping is to an empty string or when the character combines-forward. 778 * This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which 779 * is then repurposed for the MAPPING_HAS_RAW_MAPPING bit. 780 * + For details see the design doc. 781 * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into 782 * distinct ranges (combines-forward vs. not) 783 * so that a range check can be used to find out if there is a compositions list. 784 * This is fully equivalent with formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag. 785 * It is needed for the new (in ICU 49) composePair(), not for other normalization. 786 * - Addition of the smallFCD[] bit set. 787 */ 788 789 #endif /* !UCONFIG_NO_NORMALIZATION */ 790 #endif /* __NORMALIZER2IMPL_H__ */ 791