1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  *   Copyright (C) 2009-2015, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package com.ibm.icu.impl;
11 
12 import java.io.IOException;
13 import java.nio.ByteBuffer;
14 import java.util.ArrayList;
15 
16 import com.ibm.icu.text.UTF16;
17 import com.ibm.icu.text.UnicodeSet;
18 import com.ibm.icu.util.CodePointMap;
19 import com.ibm.icu.util.CodePointTrie;
20 import com.ibm.icu.util.ICUUncheckedIOException;
21 import com.ibm.icu.util.MutableCodePointTrie;
22 import com.ibm.icu.util.VersionInfo;
23 
/**
 * Low-level implementation of the Unicode Normalization Algorithm.
 * For the data structure and details see the documentation at the end of
 * C++ normalizer2impl.h and in the design doc at
 * http://site.icu-project.org/design/normalization/custom
 */
30 public final class Normalizer2Impl {
31     public static final class Hangul {
32         /* Korean Hangul and Jamo constants */
33         public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
34         public static final int JAMO_L_END=0x1112;
35         public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
36         public static final int JAMO_V_END=0x1175;
37         public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
38         public static final int JAMO_T_END=0x11c2;
39 
40         public static final int HANGUL_BASE=0xac00;
41         public static final int HANGUL_END=0xd7a3;
42 
43         public static final int JAMO_L_COUNT=19;
44         public static final int JAMO_V_COUNT=21;
45         public static final int JAMO_T_COUNT=28;
46 
47         public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
48         public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
49 
50         public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;
51 
52         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
53         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
54 
isHangul(int c)55         public static boolean isHangul(int c) {
56             return HANGUL_BASE<=c && c<HANGUL_LIMIT;
57         }
isHangulLV(int c)58         public static boolean isHangulLV(int c) {
59             c-=HANGUL_BASE;
60             return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
61         }
isJamoL(int c)62         public static boolean isJamoL(int c) {
63             return JAMO_L_BASE<=c && c<JAMO_L_LIMIT;
64         }
isJamoV(int c)65         public static boolean isJamoV(int c) {
66             return JAMO_V_BASE<=c && c<JAMO_V_LIMIT;
67         }
isJamoT(int c)68         public static boolean isJamoT(int c) {
69             int t=c-JAMO_T_BASE;
70             return 0<t && t<JAMO_T_COUNT;  // not JAMO_T_BASE itself
71         }
isJamo(int c)72         public static boolean isJamo(int c) {
73             return JAMO_L_BASE<=c && c<=JAMO_T_END &&
74                 (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c);
75         }
76 
77         /**
78          * Decomposes c, which must be a Hangul syllable, into buffer
79          * and returns the length of the decomposition (2 or 3).
80          */
decompose(int c, Appendable buffer)81         public static int decompose(int c, Appendable buffer) {
82             try {
83                 c-=HANGUL_BASE;
84                 int c2=c%JAMO_T_COUNT;
85                 c/=JAMO_T_COUNT;
86                 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
87                 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
88                 if(c2==0) {
89                     return 2;
90                 } else {
91                     buffer.append((char)(JAMO_T_BASE+c2));
92                     return 3;
93                 }
94             } catch(IOException e) {
95                 // Will not occur because we do not write to I/O.
96                 throw new ICUUncheckedIOException(e);
97             }
98         }
99 
100         /**
101          * Decomposes c, which must be a Hangul syllable, into buffer.
102          * This is the raw, not recursive, decomposition. Its length is always 2.
103          */
getRawDecomposition(int c, Appendable buffer)104         public static void getRawDecomposition(int c, Appendable buffer) {
105             try {
106                 int orig=c;
107                 c-=HANGUL_BASE;
108                 int c2=c%JAMO_T_COUNT;
109                 if(c2==0) {
110                     c/=JAMO_T_COUNT;
111                     buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
112                     buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
113                 } else {
114                     buffer.append((char)(orig-c2));  // LV syllable
115                     buffer.append((char)(JAMO_T_BASE+c2));
116                 }
117             } catch(IOException e) {
118                 // Will not occur because we do not write to I/O.
119                 throw new ICUUncheckedIOException(e);
120             }
121         }
122     }
123 
124     /**
125      * Writable buffer that takes care of canonical ordering.
126      * Its Appendable methods behave like the C++ implementation's
127      * appendZeroCC() methods.
128      * <p>
129      * If dest is a StringBuilder, then the buffer writes directly to it.
130      * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
131      * until no further changes are necessary and whole segments are appended.
132      * append() methods that take combining-class values always write to the StringBuilder.
133      * Other append() methods flush and append to the Appendable.
134      */
135     public static final class ReorderingBuffer implements Appendable {
ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity)136         public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {
137             impl=ni;
138             app=dest;
139             if(app instanceof StringBuilder) {
140                 appIsStringBuilder=true;
141                 str=(StringBuilder)dest;
142                 // In Java, the constructor subsumes public void init(int destCapacity) {
143                 str.ensureCapacity(destCapacity);
144                 reorderStart=0;
145                 if(str.length()==0) {
146                     lastCC=0;
147                 } else {
148                     setIterator();
149                     lastCC=previousCC();
150                     // Set reorderStart after the last code point with cc<=1 if there is one.
151                     if(lastCC>1) {
152                         while(previousCC()>1) {}
153                     }
154                     reorderStart=codePointLimit;
155                 }
156             } else {
157                 appIsStringBuilder=false;
158                 str=new StringBuilder();
159                 reorderStart=0;
160                 lastCC=0;
161             }
162         }
163 
isEmpty()164         public boolean isEmpty() { return str.length()==0; }
length()165         public int length() { return str.length(); }
getLastCC()166         public int getLastCC() { return lastCC; }
167 
getStringBuilder()168         public StringBuilder getStringBuilder() { return str; }
169 
equals(CharSequence s, int start, int limit)170         public boolean equals(CharSequence s, int start, int limit) {
171             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
172         }
173 
append(int c, int cc)174         public void append(int c, int cc) {
175             if(lastCC<=cc || cc==0) {
176                 str.appendCodePoint(c);
177                 lastCC=cc;
178                 if(cc<=1) {
179                     reorderStart=str.length();
180                 }
181             } else {
182                 insert(c, cc);
183             }
184         }
append(CharSequence s, int start, int limit, boolean isNFD, int leadCC, int trailCC)185         public void append(CharSequence s, int start, int limit, boolean isNFD,
186                            int leadCC, int trailCC) {
187             if(start==limit) {
188                 return;
189             }
190             if(lastCC<=leadCC || leadCC==0) {
191                 if(trailCC<=1) {
192                     reorderStart=str.length()+(limit-start);
193                 } else if(leadCC<=1) {
194                     reorderStart=str.length()+1;  // Ok if not a code point boundary.
195                 }
196                 str.append(s, start, limit);
197                 lastCC=trailCC;
198             } else {
199                 int c=Character.codePointAt(s, start);
200                 start+=Character.charCount(c);
201                 insert(c, leadCC);  // insert first code point
202                 while(start<limit) {
203                     c=Character.codePointAt(s, start);
204                     start+=Character.charCount(c);
205                     if(start<limit) {
206                         if (isNFD) {
207                             leadCC = getCCFromYesOrMaybe(impl.getNorm16(c));
208                         } else {
209                             leadCC = impl.getCC(impl.getNorm16(c));
210                         }
211                     } else {
212                         leadCC=trailCC;
213                     }
214                     append(c, leadCC);
215                 }
216             }
217         }
218         // The following append() methods work like C++ appendZeroCC().
219         // They assume that the cc or trailCC of their input is 0.
220         // Most of them implement Appendable interface methods.
221         @Override
append(char c)222         public ReorderingBuffer append(char c) {
223             str.append(c);
224             lastCC=0;
225             reorderStart=str.length();
226             return this;
227         }
appendZeroCC(int c)228         public void appendZeroCC(int c) {
229             str.appendCodePoint(c);
230             lastCC=0;
231             reorderStart=str.length();
232         }
233         @Override
append(CharSequence s)234         public ReorderingBuffer append(CharSequence s) {
235             if(s.length()!=0) {
236                 str.append(s);
237                 lastCC=0;
238                 reorderStart=str.length();
239             }
240             return this;
241         }
242         @Override
append(CharSequence s, int start, int limit)243         public ReorderingBuffer append(CharSequence s, int start, int limit) {
244             if(start!=limit) {
245                 str.append(s, start, limit);
246                 lastCC=0;
247                 reorderStart=str.length();
248             }
249             return this;
250         }
251         /**
252          * Flushes from the intermediate StringBuilder to the Appendable,
253          * if they are different objects.
254          * Used after recomposition.
255          * Must be called at the end when writing to a non-StringBuilder Appendable.
256          */
flush()257         public void flush() {
258             if(appIsStringBuilder) {
259                 reorderStart=str.length();
260             } else {
261                 try {
262                     app.append(str);
263                     str.setLength(0);
264                     reorderStart=0;
265                 } catch(IOException e) {
266                     throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
267                 }
268             }
269             lastCC=0;
270         }
271         /**
272          * Flushes from the intermediate StringBuilder to the Appendable,
273          * if they are different objects.
274          * Then appends the new text to the Appendable or StringBuilder.
275          * Normally used after quick check loops find a non-empty sequence.
276          */
flushAndAppendZeroCC(CharSequence s, int start, int limit)277         public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
278             if(appIsStringBuilder) {
279                 str.append(s, start, limit);
280                 reorderStart=str.length();
281             } else {
282                 try {
283                     app.append(str).append(s, start, limit);
284                     str.setLength(0);
285                     reorderStart=0;
286                 } catch(IOException e) {
287                     throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
288                 }
289             }
290             lastCC=0;
291             return this;
292         }
remove()293         public void remove() {
294             str.setLength(0);
295             lastCC=0;
296             reorderStart=0;
297         }
removeSuffix(int suffixLength)298         public void removeSuffix(int suffixLength) {
299             int oldLength=str.length();
300             str.delete(oldLength-suffixLength, oldLength);
301             lastCC=0;
302             reorderStart=str.length();
303         }
304 
305         /*
306          * TODO: Revisit whether it makes sense to track reorderStart.
307          * It is set to after the last known character with cc<=1,
308          * which stops previousCC() before it reads that character and looks up its cc.
309          * previousCC() is normally only called from insert().
310          * In other words, reorderStart speeds up the insertion of a combining mark
311          * into a multi-combining mark sequence where it does not belong at the end.
312          * This might not be worth the trouble.
313          * On the other hand, it's not a huge amount of trouble.
314          *
315          * We probably need it for UNORM_SIMPLE_APPEND.
316          */
317 
318         // Inserts c somewhere before the last character.
319         // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(int c, int cc)320         private void insert(int c, int cc) {
321             for(setIterator(), skipPrevious(); previousCC()>cc;) {}
322             // insert c at codePointLimit, after the character with prevCC<=cc
323             if(c<=0xffff) {
324                 str.insert(codePointLimit, (char)c);
325                 if(cc<=1) {
326                     reorderStart=codePointLimit+1;
327                 }
328             } else {
329                 str.insert(codePointLimit, Character.toChars(c));
330                 if(cc<=1) {
331                     reorderStart=codePointLimit+2;
332                 }
333             }
334         }
335 
336         private final Normalizer2Impl impl;
337         private final Appendable app;
338         private final StringBuilder str;
339         private final boolean appIsStringBuilder;
340         private int reorderStart;
341         private int lastCC;
342 
343         // private backward iterator
setIterator()344         private void setIterator() { codePointStart=str.length(); }
skipPrevious()345         private void skipPrevious() {  // Requires 0<codePointStart.
346             codePointLimit=codePointStart;
347             codePointStart=str.offsetByCodePoints(codePointStart, -1);
348         }
previousCC()349         private int previousCC() {  // Returns 0 if there is no previous character.
350             codePointLimit=codePointStart;
351             if(reorderStart>=codePointStart) {
352                 return 0;
353             }
354             int c=str.codePointBefore(codePointStart);
355             codePointStart-=Character.charCount(c);
356             return impl.getCCFromYesOrMaybeCP(c);
357         }
358 
359         private int codePointStart, codePointLimit;
360     }
361 
362     // TODO: Propose as public API on the UTF16 class.
363     // TODO: Propose widening UTF16 methods that take char to take int.
364     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
365     public static final class UTF16Plus {
366         /**
367          * Is this code point a lead surrogate (U+d800..U+dbff)?
368          * @param c code unit or code point
369          * @return true or false
370          */
isLeadSurrogate(int c)371         public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; }
372         /**
373          * Is this code point a trail surrogate (U+dc00..U+dfff)?
374          * @param c code unit or code point
375          * @return true or false
376          */
isTrailSurrogate(int c)377         public static boolean isTrailSurrogate(int c) { return (c & 0xfffffc00) == 0xdc00; }
378         /**
379          * Is this code point a surrogate (U+d800..U+dfff)?
380          * @param c code unit or code point
381          * @return true or false
382          */
isSurrogate(int c)383         public static boolean isSurrogate(int c) { return (c & 0xfffff800) == 0xd800; }
384         /**
385          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
386          * is it a lead surrogate?
387          * @param c code unit or code point
388          * @return true or false
389          */
isSurrogateLead(int c)390         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
391         /**
392          * Compares two CharSequence objects for binary equality.
393          * @param s1 first sequence
394          * @param s2 second sequence
395          * @return true if s1 contains the same text as s2
396          */
equal(CharSequence s1, CharSequence s2)397         public static boolean equal(CharSequence s1,  CharSequence s2) {
398             if(s1==s2) {
399                 return true;
400             }
401             int length=s1.length();
402             if(length!=s2.length()) {
403                 return false;
404             }
405             for(int i=0; i<length; ++i) {
406                 if(s1.charAt(i)!=s2.charAt(i)) {
407                     return false;
408                 }
409             }
410             return true;
411         }
412         /**
413          * Compares two CharSequence subsequences for binary equality.
414          * @param s1 first sequence
415          * @param start1 start offset in first sequence
416          * @param limit1 limit offset in first sequence
417          * @param s2 second sequence
418          * @param start2 start offset in second sequence
419          * @param limit2 limit offset in second sequence
420          * @return true if s1.subSequence(start1, limit1) contains the same text
421          *              as s2.subSequence(start2, limit2)
422          */
equal(CharSequence s1, int start1, int limit1, CharSequence s2, int start2, int limit2)423         public static boolean equal(CharSequence s1, int start1, int limit1,
424                                     CharSequence s2, int start2, int limit2) {
425             if((limit1-start1)!=(limit2-start2)) {
426                 return false;
427             }
428             if(s1==s2 && start1==start2) {
429                 return true;
430             }
431             while(start1<limit1) {
432                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
433                     return false;
434                 }
435             }
436             return true;
437         }
438     }
439 
Normalizer2Impl()440     public Normalizer2Impl() {}
441 
442     private static final class IsAcceptable implements ICUBinary.Authenticate {
443         @Override
isDataVersionAcceptable(byte version[])444         public boolean isDataVersionAcceptable(byte version[]) {
445             return version[0]==4;
446         }
447     }
448     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
449     private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
450 
load(ByteBuffer bytes)451     public Normalizer2Impl load(ByteBuffer bytes) {
452         try {
453             dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
454             int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
455             if(indexesLength<=IX_MIN_LCCC_CP) {
456                 throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes");
457             }
458             int[] inIndexes=new int[indexesLength];
459             inIndexes[0]=indexesLength*4;
460             for(int i=1; i<indexesLength; ++i) {
461                 inIndexes[i]=bytes.getInt();
462             }
463 
464             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
465             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
466             minLcccCP=inIndexes[IX_MIN_LCCC_CP];
467 
468             minYesNo=inIndexes[IX_MIN_YES_NO];
469             minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
470             minNoNo=inIndexes[IX_MIN_NO_NO];
471             minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
472             minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
473             minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
474             limitNoNo=inIndexes[IX_LIMIT_NO_NO];
475             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
476             assert((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
477             centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
478 
479             // Read the normTrie.
480             int offset=inIndexes[IX_NORM_TRIE_OFFSET];
481             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
482             int triePosition = bytes.position();
483             normTrie = CodePointTrie.Fast16.fromBinary(bytes);
484             int trieLength = bytes.position() - triePosition;
485             if(trieLength>(nextOffset-offset)) {
486                 throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie");
487             }
488             ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
489 
490             // Read the composition and mapping data.
491             offset=nextOffset;
492             nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
493             int numChars=(nextOffset-offset)/2;
494             if(numChars!=0) {
495                 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
496                 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
497             }
498 
499             // smallFCD: new in formatVersion 2
500             offset=nextOffset;
501             smallFCD=new byte[0x100];
502             bytes.get(smallFCD);
503 
504             return this;
505         } catch(IOException e) {
506             throw new ICUUncheckedIOException(e);
507         }
508     }
load(String name)509     public Normalizer2Impl load(String name) {
510         return load(ICUBinary.getRequiredData(name));
511     }
512 
addLcccChars(UnicodeSet set)513     public void addLcccChars(UnicodeSet set) {
514         int start = 0;
515         CodePointMap.Range range = new CodePointMap.Range();
516         while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT,
517                 null, range)) {
518             int end = range.getEnd();
519             int norm16 = range.getValue();
520             if (norm16 > MIN_NORMAL_MAYBE_YES && norm16 != JAMO_VT) {
521                 set.add(start, end);
522             } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
523                 int fcd16 = getFCD16(start);
524                 if (fcd16 > 0xff) { set.add(start, end); }
525             }
526             start = end + 1;
527         }
528     }
529 
addPropertyStarts(UnicodeSet set)530     public void addPropertyStarts(UnicodeSet set) {
531         // Add the start code point of each same-value range of the trie.
532         int start = 0;
533         CodePointMap.Range range = new CodePointMap.Range();
534         while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT,
535                 null, range)) {
536             int end = range.getEnd();
537             int value = range.getValue();
538             set.add(start);
539             if (start != end && isAlgorithmicNoNo(value) &&
540                     (value & DELTA_TCCC_MASK) > DELTA_TCCC_1) {
541                 // Range of code points with same-norm16-value algorithmic decompositions.
542                 // They might have different non-zero FCD16 values.
543                 int prevFCD16 = getFCD16(start);
544                 while (++start <= end) {
545                     int fcd16 = getFCD16(start);
546                     if (fcd16 != prevFCD16) {
547                         set.add(start);
548                         prevFCD16 = fcd16;
549                     }
550                 }
551             }
552             start = end + 1;
553         }
554 
555         /* add Hangul LV syllables and LV+1 because of skippables */
556         for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) {
557             set.add(c);
558             set.add(c+1);
559         }
560         set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
561     }
562 
addCanonIterPropertyStarts(UnicodeSet set)563     public void addCanonIterPropertyStarts(UnicodeSet set) {
564         // Add the start code point of each same-value range of the canonical iterator data trie.
565         ensureCanonIterData();
566         // Currently only used for the SEGMENT_STARTER property.
567         int start = 0;
568         CodePointMap.Range range = new CodePointMap.Range();
569         while (canonIterData.getRange(start, segmentStarterMapper, range)) {
570             set.add(start);
571             start = range.getEnd() + 1;
572         }
573     }
574     private static final CodePointMap.ValueFilter segmentStarterMapper =
575             new CodePointMap.ValueFilter() {
576         @Override
577         public int apply(int value) {
578             return value & CANON_NOT_SEGMENT_STARTER;
579         }
580     };
581 
    // low-level properties ------------------------------------------------ ***

    // Note: Normalizer2Impl.java r30983 (2011-nov-27)
    // still had getFCDTrie() which built and cached an FCD trie.
    // That provided faster access to FCD data than getFCD16FromNormData()
    // but required synchronization and consumed some 10kB of heap memory
    // in any process that uses FCD (e.g., via collation).
    // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
    // at least for ASCII & CJK.
591 
592     /**
593      * Builds the canonical-iterator data for this instance.
594      * This is required before any of {@link #isCanonSegmentStarter(int)} or
595      * {@link #getCanonStartSet(int, UnicodeSet)} are called,
596      * or else they crash.
597      * @return this
598      */
ensureCanonIterData()599     public synchronized Normalizer2Impl ensureCanonIterData() {
600         if(canonIterData==null) {
601             MutableCodePointTrie mutableTrie = new MutableCodePointTrie(0, 0);
602             canonStartSets=new ArrayList<UnicodeSet>();
603             int start = 0;
604             CodePointMap.Range range = new CodePointMap.Range();
605             while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT,
606                     null, range)) {
607                 final int end = range.getEnd();
608                 final int norm16 = range.getValue();
609                 if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) {
610                     // Inert, or 2-way mapping (including Hangul syllable).
611                     // We do not write a canonStartSet for any yesNo character.
612                     // Composites from 2-way mappings are added at runtime from the
613                     // starter's compositions list, and the other characters in
614                     // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
615                     // "maybe" characters.
616                     start = end + 1;
617                     continue;
618                 }
619                 for (int c = start; c <= end; ++c) {
620                     final int oldValue = mutableTrie.get(c);
621                     int newValue=oldValue;
622                     if(isMaybeOrNonZeroCC(norm16)) {
623                         // not a segment starter if it occurs in a decomposition or has cc!=0
624                         newValue|=CANON_NOT_SEGMENT_STARTER;
625                         if(norm16<MIN_NORMAL_MAYBE_YES) {
626                             newValue|=CANON_HAS_COMPOSITIONS;
627                         }
628                     } else if(norm16<minYesNo) {
629                         newValue|=CANON_HAS_COMPOSITIONS;
630                     } else {
631                         // c has a one-way decomposition
632                         int c2=c;
633                         // Do not modify the whole-range norm16 value.
634                         int norm16_2=norm16;
635                         if (isDecompNoAlgorithmic(norm16_2)) {
636                             // Maps to an isCompYesAndZeroCC.
637                             c2 = mapAlgorithmic(c2, norm16_2);
638                             norm16_2 = getRawNorm16(c2);
639                             // No compatibility mappings for the CanonicalIterator.
640                             assert(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));
641                         }
642                         if (norm16_2 > minYesNo) {
643                             // c decomposes, get everything from the variable-length extra data
644                             int mapping=norm16_2>>OFFSET_SHIFT;
645                             int firstUnit=extraData.charAt(mapping);
646                             int length=firstUnit&MAPPING_LENGTH_MASK;
647                             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
648                                 if(c==c2 && (extraData.charAt(mapping-1)&0xff)!=0) {
649                                     newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
650                                 }
651                             }
652                             // Skip empty mappings (no characters in the decomposition).
653                             if(length!=0) {
654                                 ++mapping;  // skip over the firstUnit
655                                 // add c to first code point's start set
656                                 int limit=mapping+length;
657                                 c2=extraData.codePointAt(mapping);
658                                 addToStartSet(mutableTrie, c, c2);
659                                 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
660                                 // one-way mapping. A 2-way mapping is possible here after
661                                 // intermediate algorithmic mapping.
662                                 if(norm16_2>=minNoNo) {
663                                     while((mapping+=Character.charCount(c2))<limit) {
664                                         c2=extraData.codePointAt(mapping);
665                                         int c2Value = mutableTrie.get(c2);
666                                         if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
667                                             mutableTrie.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
668                                         }
669                                     }
670                                 }
671                             }
672                         } else {
673                             // c decomposed to c2 algorithmically; c has cc==0
674                             addToStartSet(mutableTrie, c, c2);
675                         }
676                     }
677                     if(newValue!=oldValue) {
678                         mutableTrie.set(c, newValue);
679                     }
680                 }
681                 start = end + 1;
682             }
683             canonIterData = mutableTrie.buildImmutable(
684                     CodePointTrie.Type.SMALL, CodePointTrie.ValueWidth.BITS_32);
685         }
686         return this;
687     }
688 
689     // The trie stores values for lead surrogate code *units*.
690     // Surrogate code *points* are inert.
getNorm16(int c)691     public int getNorm16(int c) {
692         return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c);
693     }
getRawNorm16(int c)694     public int getRawNorm16(int c) { return normTrie.get(c); }
695 
getCompQuickCheck(int norm16)696     public int getCompQuickCheck(int norm16) {
697         if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
698             return 1;  // yes
699         } else if(minMaybeYes<=norm16) {
700             return 2;  // maybe
701         } else {
702             return 0;  // no
703         }
704     }
isAlgorithmicNoNo(int norm16)705     public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
isCompNo(int norm16)706     public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
isDecompYes(int norm16)707     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
708 
getCC(int norm16)709     public int getCC(int norm16) {
710         if(norm16>=MIN_NORMAL_MAYBE_YES) {
711             return getCCFromNormalYesOrMaybe(norm16);
712         }
713         if(norm16<minNoNo || limitNoNo<=norm16) {
714             return 0;
715         }
716         return getCCFromNoNo(norm16);
717     }
getCCFromNormalYesOrMaybe(int norm16)718     public static int getCCFromNormalYesOrMaybe(int norm16) {
719         return (norm16 >> OFFSET_SHIFT) & 0xff;
720     }
getCCFromYesOrMaybe(int norm16)721     public static int getCCFromYesOrMaybe(int norm16) {
722         return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
723     }
getCCFromYesOrMaybeCP(int c)724     public int getCCFromYesOrMaybeCP(int c) {
725         if (c < minCompNoMaybeCP) { return 0; }
726         return getCCFromYesOrMaybe(getNorm16(c));
727     }
728 
729     /**
730      * Returns the FCD data for code point c.
731      * @param c A Unicode code point.
732      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
733      */
getFCD16(int c)734     public int getFCD16(int c) {
735         if(c<minDecompNoCP) {
736             return 0;
737         } else if(c<=0xffff) {
738             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
739         }
740         return getFCD16FromNormData(c);
741     }
742     /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
singleLeadMightHaveNonZeroFCD16(int lead)743     public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
744         // 0<=lead<=0xffff
745         byte bits=smallFCD[lead>>8];
746         if(bits==0) { return false; }
747         return ((bits>>((lead>>5)&7))&1)!=0;
748     }
749 
750     /** Gets the FCD value from the regular normalization data. */
getFCD16FromNormData(int c)751     public int getFCD16FromNormData(int c) {
752         int norm16=getNorm16(c);
753         if (norm16 >= limitNoNo) {
754             if(norm16>=MIN_NORMAL_MAYBE_YES) {
755                 // combining mark
756                 norm16=getCCFromNormalYesOrMaybe(norm16);
757                 return norm16|(norm16<<8);
758             } else if(norm16>=minMaybeYes) {
759                 return 0;
760             } else {  // isDecompNoAlgorithmic(norm16)
761                 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
762                 if (deltaTrailCC <= DELTA_TCCC_1) {
763                     return deltaTrailCC >> OFFSET_SHIFT;
764                 }
765                 // Maps to an isCompYesAndZeroCC.
766                 c=mapAlgorithmic(c, norm16);
767                 norm16 = getRawNorm16(c);
768             }
769         }
770         if(norm16<=minYesNo || isHangulLVT(norm16)) {
771             // no decomposition or Hangul syllable, all zeros
772             return 0;
773         }
774         // c decomposes, get everything from the variable-length extra data
775         int mapping=norm16>>OFFSET_SHIFT;
776         int firstUnit=extraData.charAt(mapping);
777         int fcd16=firstUnit>>8;  // tccc
778         if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
779             fcd16|=extraData.charAt(mapping-1)&0xff00;  // lccc
780         }
781         return fcd16;
782     }
783 
784     /**
785      * Gets the decomposition for one code point.
786      * @param c code point
787      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
788      */
getDecomposition(int c)789     public String getDecomposition(int c) {
790         int norm16;
791         if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
792             // c does not decompose
793             return null;
794         }
795         int decomp = -1;
796         if(isDecompNoAlgorithmic(norm16)) {
797             // Maps to an isCompYesAndZeroCC.
798             decomp=c=mapAlgorithmic(c, norm16);
799             // The mapping might decompose further.
800             norm16 = getRawNorm16(c);
801         }
802         if (norm16 < minYesNo) {
803             if(decomp<0) {
804                 return null;
805             } else {
806                 return UTF16.valueOf(decomp);
807             }
808         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
809             // Hangul syllable: decompose algorithmically
810             StringBuilder buffer=new StringBuilder();
811             Hangul.decompose(c, buffer);
812             return buffer.toString();
813         }
814         // c decomposes, get everything from the variable-length extra data
815         int mapping=norm16>>OFFSET_SHIFT;
816         int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
817         return extraData.substring(mapping, mapping+length);
818     }
819 
820     /**
821      * Gets the raw decomposition for one code point.
822      * @param c code point
823      * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition
824      */
getRawDecomposition(int c)825     public String getRawDecomposition(int c) {
826         int norm16;
827         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
828             // c does not decompose
829             return null;
830         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
831             // Hangul syllable: decompose algorithmically
832             StringBuilder buffer=new StringBuilder();
833             Hangul.getRawDecomposition(c, buffer);
834             return buffer.toString();
835         } else if(isDecompNoAlgorithmic(norm16)) {
836             return UTF16.valueOf(mapAlgorithmic(c, norm16));
837         }
838         // c decomposes, get everything from the variable-length extra data
839         int mapping=norm16>>OFFSET_SHIFT;
840         int firstUnit=extraData.charAt(mapping);
841         int mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
842         if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) {
843             // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
844             // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
845             int rawMapping=mapping-((firstUnit>>7)&1)-1;
846             char rm0=extraData.charAt(rawMapping);
847             if(rm0<=MAPPING_LENGTH_MASK) {
848                 return extraData.substring(rawMapping-rm0, rawMapping);
849             } else {
850                 // Copy the normal mapping and replace its first two code units with rm0.
851                 StringBuilder buffer=new StringBuilder(mLength-1).append(rm0);
852                 mapping+=1+2;  // skip over the firstUnit and the first two mapping code units
853                 return buffer.append(extraData, mapping, mapping+mLength-2).toString();
854             }
855         } else {
856             mapping+=1;  // skip over the firstUnit
857             return extraData.substring(mapping, mapping+mLength);
858         }
859     }
860 
861     /**
862      * Returns true if code point c starts a canonical-iterator string segment.
863      * <b>{@link #ensureCanonIterData()} must have been called before this method,
864      * or else this method will crash.</b>
865      * @param c A Unicode code point.
866      * @return true if c starts a canonical-iterator string segment.
867      */
isCanonSegmentStarter(int c)868     public boolean isCanonSegmentStarter(int c) {
869         return canonIterData.get(c)>=0;
870     }
871     /**
872      * Returns true if there are characters whose decomposition starts with c.
873      * If so, then the set is cleared and then filled with those characters.
874      * <b>{@link #ensureCanonIterData()} must have been called before this method,
875      * or else this method will crash.</b>
876      * @param c A Unicode code point.
877      * @param set A UnicodeSet to receive the characters whose decompositions
878      *        start with c, if there are any.
879      * @return true if there are characters whose decomposition starts with c.
880      */
getCanonStartSet(int c, UnicodeSet set)881     public boolean getCanonStartSet(int c, UnicodeSet set) {
882         int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
883         if(canonValue==0) {
884             return false;
885         }
886         set.clear();
887         int value=canonValue&CANON_VALUE_MASK;
888         if((canonValue&CANON_HAS_SET)!=0) {
889             set.addAll(canonStartSets.get(value));
890         } else if(value!=0) {
891             set.add(value);
892         }
893         if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
894             int norm16 = getRawNorm16(c);
895             if(norm16==JAMO_L) {
896                 int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
897                 set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
898             } else {
899                 addComposites(getCompositionsList(norm16), set);
900             }
901         }
902         return true;
903     }
904 
    // Fixed norm16 values.
    // norm16 values at or above MIN_YES_YES_WITH_CC are quick-check "yes" in getCompQuickCheck().
    public static final int MIN_YES_YES_WITH_CC=0xfe02;
    public static final int JAMO_VT=0xfe00;
    // At or above MIN_NORMAL_MAYBE_YES, getCCFromNormalYesOrMaybe() reads the ccc
    // from norm16 bits 8..1.
    public static final int MIN_NORMAL_MAYBE_YES=0xfc00;
    public static final int JAMO_L=2;  // offset=1 hasCompBoundaryAfter=FALSE
    public static final int INERT=1;  // offset=0 hasCompBoundaryAfter=TRUE

    // norm16 bit 0 is comp-boundary-after.
    public static final int HAS_COMP_BOUNDARY_AFTER=1;
    // norm16>>OFFSET_SHIFT drops bit 0; for values with a mapping the result
    // is used as the index of the mapping's firstUnit in extraData.
    public static final int OFFSET_SHIFT=1;

    // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
    // tccc (0, 1, >1) for quick FCC boundary-after tests.
    public static final int DELTA_TCCC_0=0;
    public static final int DELTA_TCCC_1=2;
    public static final int DELTA_TCCC_GT_1=4;
    public static final int DELTA_TCCC_MASK=6;
    public static final int DELTA_SHIFT=3;

    public static final int MAX_DELTA=0x40;

    // NOTE(review): the IX_* values below look like indexes into the data file's
    // array of header integers -- confirm against the loading code.

    // Byte offsets from the start of the data, after the generic header.
    public static final int IX_NORM_TRIE_OFFSET=0;
    public static final int IX_EXTRA_DATA_OFFSET=1;
    public static final int IX_SMALL_FCD_OFFSET=2;
    public static final int IX_RESERVED3_OFFSET=3;
    public static final int IX_TOTAL_SIZE=7;

    // Code point thresholds for quick check codes.
    public static final int IX_MIN_DECOMP_NO_CP=8;
    public static final int IX_MIN_COMP_NO_MAYBE_CP=9;

    // Norm16 value thresholds for quick check combinations and types of extra data.

    /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
    public static final int IX_MIN_YES_NO=10;
    /** Mappings are comp-normalized. */
    public static final int IX_MIN_NO_NO=11;
    public static final int IX_LIMIT_NO_NO=12;
    public static final int IX_MIN_MAYBE_YES=13;

    /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
    public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
    /** Mappings are not comp-normalized but have a comp boundary before. */
    public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
    /** Mappings do not have a comp boundary before. */
    public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
    /** Mappings to the empty string. */
    public static final int IX_MIN_NO_NO_EMPTY=17;

    public static final int IX_MIN_LCCC_CP=18;
    public static final int IX_COUNT=20;

    // Flags and fields in the firstUnit of a mapping in extraData.
    // The high byte of the firstUnit (firstUnit>>8) is the tccc.
    // MAPPING_HAS_CCC_LCCC_WORD: the unit before the firstUnit holds
    // the ccc in its low byte and the lccc in its high byte.
    public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
    // MAPPING_HAS_RAW_MAPPING: a raw mapping is stored before the firstUnit
    // and before the optional ccc/lccc word.
    public static final int MAPPING_HAS_RAW_MAPPING=0x40;
    // unused bit 0x20;
    // Low bits of the firstUnit: length of the mapping in code units.
    public static final int MAPPING_LENGTH_MASK=0x1f;

    public static final int COMP_1_LAST_TUPLE=0x8000;
    public static final int COMP_1_TRIPLE=1;
    public static final int COMP_1_TRAIL_LIMIT=0x3400;
    public static final int COMP_1_TRAIL_MASK=0x7ffe;
    public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
    public static final int COMP_2_TRAIL_SHIFT=6;
    public static final int COMP_2_TRAIL_MASK=0xffc0;
970 
971     // higher-level functionality ------------------------------------------ ***
972 
973     // NFD without an NFD Normalizer2 instance.
decompose(CharSequence s, StringBuilder dest)974     public Appendable decompose(CharSequence s, StringBuilder dest) {
975         decompose(s, 0, s.length(), dest, s.length());
976         return dest;
977     }
978     /**
979      * Decomposes s[src, limit[ and writes the result to dest.
980      * limit can be NULL if src is NUL-terminated.
981      * destLengthEstimate is the initial dest buffer capacity and can be -1.
982      */
decompose(CharSequence s, int src, int limit, StringBuilder dest, int destLengthEstimate)983     public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
984                    int destLengthEstimate) {
985         if(destLengthEstimate<0) {
986             destLengthEstimate=limit-src;
987         }
988         dest.setLength(0);
989         ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
990         decompose(s, src, limit, buffer);
991     }
992 
993     // Dual functionality:
994     // buffer!=NULL: normalize
995     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
decompose(CharSequence s, int src, int limit, ReorderingBuffer buffer)996     public int decompose(CharSequence s, int src, int limit,
997                          ReorderingBuffer buffer) {
998         int minNoCP=minDecompNoCP;
999 
1000         int prevSrc;
1001         int c=0;
1002         int norm16=0;
1003 
1004         // only for quick check
1005         int prevBoundary=src;
1006         int prevCC=0;
1007 
1008         for(;;) {
1009             // count code units below the minimum or with irrelevant data for the quick check
1010             for(prevSrc=src; src!=limit;) {
1011                 if( (c=s.charAt(src))<minNoCP ||
1012                     isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c))
1013                 ) {
1014                     ++src;
1015                 } else if (!UTF16Plus.isLeadSurrogate(c)) {
1016                     break;
1017                 } else {
1018                     char c2;
1019                     if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
1020                         c = Character.toCodePoint((char)c, c2);
1021                         norm16 = normTrie.suppGet(c);
1022                         if (isMostDecompYesAndZeroCC(norm16)) {
1023                             src += 2;
1024                         } else {
1025                             break;
1026                         }
1027                     } else {
1028                         ++src;  // unpaired lead surrogate: inert
1029                     }
1030                 }
1031             }
1032             // copy these code units all at once
1033             if(src!=prevSrc) {
1034                 if(buffer!=null) {
1035                     buffer.flushAndAppendZeroCC(s, prevSrc, src);
1036                 } else {
1037                     prevCC=0;
1038                     prevBoundary=src;
1039                 }
1040             }
1041             if(src==limit) {
1042                 break;
1043             }
1044 
1045             // Check one above-minimum, relevant code point.
1046             src+=Character.charCount(c);
1047             if(buffer!=null) {
1048                 decompose(c, norm16, buffer);
1049             } else {
1050                 if(isDecompYes(norm16)) {
1051                     int cc=getCCFromYesOrMaybe(norm16);
1052                     if(prevCC<=cc || cc==0) {
1053                         prevCC=cc;
1054                         if(cc<=1) {
1055                             prevBoundary=src;
1056                         }
1057                         continue;
1058                     }
1059                 }
1060                 return prevBoundary;  // "no" or cc out of order
1061             }
1062         }
1063         return src;
1064     }
decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer)1065     public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
1066         int limit=s.length();
1067         if(limit==0) {
1068             return;
1069         }
1070         if(doDecompose) {
1071             decompose(s, 0, limit, buffer);
1072             return;
1073         }
1074         // Just merge the strings at the boundary.
1075         int c=Character.codePointAt(s, 0);
1076         int src=0;
1077         int firstCC, prevCC, cc;
1078         firstCC=prevCC=cc=getCC(getNorm16(c));
1079         while(cc!=0) {
1080             prevCC=cc;
1081             src+=Character.charCount(c);
1082             if(src>=limit) {
1083                 break;
1084             }
1085             c=Character.codePointAt(s, src);
1086             cc=getCC(getNorm16(c));
1087         };
1088         buffer.append(s, 0, src, false, firstCC, prevCC);
1089         buffer.append(s, src, limit);
1090     }
1091 
    /**
     * Composes s[src, limit[ into buffer, or checks whether that text is already composed.
     * Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
     *
     * @param s the source text
     * @param src start index in s
     * @param limit limit (exclusive end) index in s
     * @param onlyContiguous true for FCC-style behavior; forwarded to the
     *        boundary-after tests, decomposeShort() and recompose()
     * @param doCompose true: normalize into buffer;
     *        false: isNormalized (buffer must be empty and initialized;
     *        it is used as scratch space on the slow path and emptied again)
     * @param buffer the output (or scratch) buffer
     * @return always true when doCompose; otherwise true if s[src, limit[ is normalized,
     *         false as soon as a difference is detected
     */
    public boolean compose(CharSequence s, int src, int limit,
                           boolean onlyContiguous,
                           boolean doCompose,
                           ReorderingBuffer buffer) {
        // Text in s[prevBoundary, src[ has been examined but not yet appended to the buffer.
        int prevBoundary=src;
        int minNoMaybeCP=minCompNoMaybeCP;

        for (;;) {
            // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
            // or with (compYes && ccc==0) properties.
            int prevSrc;
            int c = 0;
            int norm16 = 0;
            for (;;) {
                if (src == limit) {
                    if (prevBoundary != limit && doCompose) {
                        buffer.append(s, prevBoundary, limit);
                    }
                    return true;
                }
                if( (c=s.charAt(src))<minNoMaybeCP ||
                    isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
                ) {
                    ++src;
                } else {
                    // prevSrc is the start of the current character; src moves past its first code unit.
                    prevSrc = src++;
                    if (!UTF16Plus.isLeadSurrogate(c)) {
                        break;
                    } else {
                        char c2;
                        if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
                            ++src;
                            c = Character.toCodePoint((char)c, c2);
                            norm16 = normTrie.suppGet(c);
                            if (!isCompYesAndZeroCC(norm16)) {
                                break;
                            }
                        }
                        // Otherwise: unpaired lead surrogate, or a supplementary character
                        // with (compYes && ccc==0) properties; keep scanning.
                    }
                }
            }
            // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
            // The current character is either a "noNo" (has a mapping)
            // or a "maybeYes" (combines backward)
            // or a "yesYes" with ccc!=0.
            // It is not a Hangul syllable or Jamo L because those have "yes" properties.

            // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
            if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
                if (!doCompose) {
                    return false;
                }
                // Fast path for mapping a character that is immediately surrounded by boundaries.
                // In this case, we need not decompose around the current character.
                if (isDecompNoAlgorithmic(norm16)) {
                    // Maps to a single isCompYesAndZeroCC character
                    // which also implies hasCompBoundaryBefore.
                    if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                            hasCompBoundaryBefore(s, src, limit)) {
                        if (prevBoundary != prevSrc) {
                            buffer.append(s, prevBoundary, prevSrc);
                        }
                        buffer.append(mapAlgorithmic(c, norm16), 0);
                        prevBoundary = src;
                        continue;
                    }
                } else if (norm16 < minNoNoCompBoundaryBefore) {
                    // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
                    if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                            hasCompBoundaryBefore(s, src, limit)) {
                        if (prevBoundary != prevSrc) {
                            buffer.append(s, prevBoundary, prevSrc);
                        }
                        // Copy the stored mapping: its length is in the low bits of the firstUnit.
                        int mapping = norm16 >> OFFSET_SHIFT;
                        int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK;
                        buffer.append(extraData, mapping, mapping + length);
                        prevBoundary = src;
                        continue;
                    }
                } else if (norm16 >= minNoNoEmpty) {
                    // The current character maps to nothing.
                    // Simply omit it from the output if there is a boundary before _or_ after it.
                    // The character itself implies no boundaries.
                    if (hasCompBoundaryBefore(s, src, limit) ||
                            hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) {
                        if (prevBoundary != prevSrc) {
                            buffer.append(s, prevBoundary, prevSrc);
                        }
                        prevBoundary = src;
                        continue;
                    }
                }
                // Other "noNo" type, or need to examine more text around this character:
                // Fall through to the slow path.
            } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
                // There is pending text before the Jamo V/T; look at the code unit right before it.
                char prev=s.charAt(prevSrc-1);
                if(c<Hangul.JAMO_T_BASE) {
                    // The current character is a Jamo Vowel,
                    // compose with previous Jamo L and following Jamo T.
                    // (char) cast: a prev below JAMO_L_BASE wraps to a large value and fails the test.
                    char l = (char)(prev-Hangul.JAMO_L_BASE);
                    if(l<Hangul.JAMO_L_COUNT) {
                        if (!doCompose) {
                            return false;
                        }
                        // t>0: index of a following Jamo T; t==0: no T; t<0: undecided, use the slow path.
                        int t;
                        if (src != limit &&
                                0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) &&
                                t < Hangul.JAMO_T_COUNT) {
                            // The next character is a Jamo T.
                            ++src;
                        } else if (hasCompBoundaryBefore(s, src, limit)) {
                            // No Jamo T follows, not even via decomposition.
                            t = 0;
                        } else {
                            t = -1;
                        }
                        if (t >= 0) {
                            int syllable = Hangul.HANGUL_BASE +
                                (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) *
                                Hangul.JAMO_T_COUNT + t;
                            --prevSrc;  // Replace the Jamo L as well.
                            if (prevBoundary != prevSrc) {
                                buffer.append(s, prevBoundary, prevSrc);
                            }
                            buffer.append((char)syllable);
                            prevBoundary = src;
                            continue;
                        }
                        // If we see L+V+x where x!=T then we drop to the slow path,
                        // decompose and recompose.
                        // This is to deal with NFKC finding normal L and V but a
                        // compatibility variant of a T.
                        // We need to either fully compose that combination here
                        // (which would complicate the code and may not work with strange custom data)
                        // or use the slow path.
                    }
                } else if (Hangul.isHangulLV(prev)) {
                    // The current character is a Jamo Trailing consonant,
                    // compose with previous Hangul LV that does not contain a Jamo T.
                    if (!doCompose) {
                        return false;
                    }
                    int syllable = prev + c - Hangul.JAMO_T_BASE;
                    --prevSrc;  // Replace the Hangul LV as well.
                    if (prevBoundary != prevSrc) {
                        buffer.append(s, prevBoundary, prevSrc);
                    }
                    buffer.append((char)syllable);
                    prevBoundary = src;
                    continue;
                }
                // No matching context, or may need to decompose surrounding text first:
                // Fall through to the slow path.
            } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
                // One or more combining marks that do not combine-back:
                // Check for canonical order, copy unchanged if ok and
                // if followed by a character with a boundary-before.
                int cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
                if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) {
                    // Fails FCD test, need to decompose and contiguously recompose.
                    if (!doCompose) {
                        return false;
                    }
                } else {
                    // If !onlyContiguous (not FCC), then we ignore the tccc of
                    // the previous character which passed the quick check "yes && ccc==0" test.
                    int n16;
                    // Scan forward over further combining marks while their ccc values stay in order.
                    for (;;) {
                        if (src == limit) {
                            if (doCompose) {
                                buffer.append(s, prevBoundary, limit);
                            }
                            return true;
                        }
                        int prevCC = cc;
                        c = Character.codePointAt(s, src);
                        n16 = normTrie.get(c);
                        if (n16 >= MIN_YES_YES_WITH_CC) {
                            cc = getCCFromNormalYesOrMaybe(n16);
                            if (prevCC > cc) {
                                if (!doCompose) {
                                    return false;
                                }
                                break;
                            }
                        } else {
                            break;
                        }
                        src += Character.charCount(c);
                    }
                    // src is after the last in-order combining mark.
                    // If there is a boundary here, then we continue with no change.
                    if (norm16HasCompBoundaryBefore(n16)) {
                        if (isCompYesAndZeroCC(n16)) {
                            src += Character.charCount(c);
                        }
                        continue;
                    }
                    // Use the slow path. There is no boundary in [prevSrc, src[.
                }
            }

            // Slow path: Find the nearest boundaries around the current character,
            // decompose and recompose.
            if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
                // Include the preceding character unless there is a boundary after it.
                c = Character.codePointBefore(s, prevSrc);
                norm16 = normTrie.get(c);
                if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
                    prevSrc -= Character.charCount(c);
                }
            }
            if (doCompose && prevBoundary != prevSrc) {
                buffer.append(s, prevBoundary, prevSrc);
            }
            int recomposeStartIndex=buffer.length();
            // We know there is not a boundary here.
            decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
                           buffer);
            // Decompose until the next boundary.
            src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous,
                                 buffer);
            recompose(buffer, recomposeStartIndex, onlyContiguous);
            if(!doCompose) {
                // isNormalized: the recomposed segment must equal the original text;
                // then discard it so that the buffer is empty again.
                if(!buffer.equals(s, prevSrc, src)) {
                    return false;
                }
                buffer.remove();
            }
            prevBoundary=src;
        }
    }
1326 
/**
 * Quick-check / span counterpart of compose() for the range s[src..limit[.
 * Very similar to compose(): Make the same changes in both places if relevant.
 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
 * !doSpan: quickCheck
 *
 * @param s the text to examine
 * @param src start index (inclusive) into s
 * @param limit end index (exclusive) into s
 * @param onlyContiguous true for the FCC variant: a combining mark is additionally compared
 *        with the trailing combining class of the character before it
 * @param doSpan true to return at the first "maybe" character instead of recording it in bit 0
 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
 *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
 *         then the quick check result is "no"
 */
public int composeQuickCheck(CharSequence s, int src, int limit,
                             boolean onlyContiguous, boolean doSpan) {
    // qcResult becomes 1 once a "maybe" character has been seen (only when !doSpan).
    int qcResult=0;
    // Index of the last known composition boundary; returned (shifted) for "no".
    int prevBoundary=src;
    int minNoMaybeCP=minCompNoMaybeCP;

    for(;;) {
        // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
        // or with (compYes && ccc==0) properties.
        int prevSrc;
        int c = 0;
        int norm16 = 0;
        for (;;) {
            if(src==limit) {
                return (src<<1)|qcResult;  // "yes" or "maybe"
            }
            if( (c=s.charAt(src))<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
            ) {
                ++src;
            } else {
                // prevSrc marks the start of the character that stopped the fast path.
                prevSrc = src++;
                if (!UTF16Plus.isLeadSurrogate(c)) {
                    break;
                } else {
                    // Lead surrogate: assemble the supplementary code point if a trail follows.
                    // An unpaired lead surrogate is simply skipped and the scan continues.
                    char c2;
                    if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
                        ++src;
                        c = Character.toCodePoint((char)c, c2);
                        norm16 = normTrie.suppGet(c);
                        if (!isCompYesAndZeroCC(norm16)) {
                            break;
                        }
                    }
                }
            }
        }
        // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
        // The current character is either a "noNo" (has a mapping)
        // or a "maybeYes" (combines backward)
        // or a "yesYes" with ccc!=0.
        // It is not a Hangul syllable or Jamo L because those have "yes" properties.

        // prevNorm16 stays INERT unless the character before prevSrc has no
        // composition boundary after it; in that case it holds that character's norm16.
        int prevNorm16 = INERT;
        if (prevBoundary != prevSrc) {
            prevBoundary = prevSrc;
            if (!norm16HasCompBoundaryBefore(norm16)) {
                c = Character.codePointBefore(s, prevSrc);
                int n16 = getNorm16(c);
                if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
                    // Move the boundary back to before the preceding code point.
                    prevBoundary -= Character.charCount(c);
                    prevNorm16 = n16;
                }
            }
        }

        if(isMaybeOrNonZeroCC(norm16)) {
            int cc=getCCFromYesOrMaybe(norm16);
            if (onlyContiguous /* FCC */ && cc != 0 &&
                    getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
                // The [prevBoundary..prevSrc[ character
                // passed the quick check "yes && ccc==0" test
                // but is out of canonical order with the current combining mark.
                // Falls through to the "no" return at the bottom of the outer loop.
            } else {
                // If !onlyContiguous (not FCC), then we ignore the tccc of
                // the previous character which passed the quick check "yes && ccc==0" test.
                for (;;) {
                    // A norm16 below MIN_YES_YES_WITH_CC is treated as "maybe" here.
                    if (norm16 < MIN_YES_YES_WITH_CC) {
                        if (!doSpan) {
                            qcResult = 1;
                        } else {
                            return prevBoundary << 1;  // spanYes does not care to know it's "maybe"
                        }
                    }
                    if (src == limit) {
                        return (src<<1) | qcResult;  // "yes" or "maybe"
                    }
                    int prevCC = cc;
                    c = Character.codePointAt(s, src);
                    norm16 = getNorm16(c);
                    if (isMaybeOrNonZeroCC(norm16)) {
                        cc = getCCFromYesOrMaybe(norm16);
                        // Stop when the next mark has a lower, non-zero cc than the previous one.
                        if (!(prevCC <= cc || cc == 0)) {
                            break;
                        }
                    } else {
                        break;
                    }
                    src += Character.charCount(c);
                }
                // src is after the last in-order combining mark.
                if (isCompYesAndZeroCC(norm16)) {
                    // The character at src is "yes" with ccc==0: record the boundary before it,
                    // step over it, and resume the fast path.
                    prevBoundary = src;
                    src += Character.charCount(c);
                    continue;
                }
            }
        }
        return prevBoundary<<1;  // "no"
    }
}
composeAndAppend(CharSequence s, boolean doCompose, boolean onlyContiguous, ReorderingBuffer buffer)1436     public void composeAndAppend(CharSequence s,
1437                                  boolean doCompose,
1438                                  boolean onlyContiguous,
1439                                  ReorderingBuffer buffer) {
1440         int src=0, limit=s.length();
1441         if(!buffer.isEmpty()) {
1442             int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous);
1443             if(0!=firstStarterInSrc) {
1444                 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1445                                                                buffer.length(), onlyContiguous);
1446                 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1447                                                        firstStarterInSrc+16);
1448                 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1449                 buffer.removeSuffix(buffer.length()-lastStarterInDest);
1450                 middle.append(s, 0, firstStarterInSrc);
1451                 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1452                 src=firstStarterInSrc;
1453             }
1454         }
1455         if(doCompose) {
1456             compose(s, src, limit, onlyContiguous, true, buffer);
1457         } else {
1458             buffer.append(s, src, limit);
1459         }
1460     }
/**
 * Dual functionality:
 * buffer!=null: normalize s[src..limit[ and append the result to buffer
 * buffer==null: isNormalized/quickCheck/spanQuickCheckYes
 *
 * @param s the source text
 * @param src start index (inclusive) into s
 * @param limit end index (exclusive) into s
 * @param buffer the destination, or null to only check the text
 * @return with buffer==null: the last FCD-safe boundary if a character is found out of order,
 *         otherwise limit; with buffer!=null: limit
 */
public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
    // Note: In this function we use buffer.appendZeroCC() because we track
    // the lead and trail combining classes here, rather than leaving it to
    // the ReorderingBuffer.
    // The exception is the call to decomposeShort() which uses the buffer
    // in the normal way.

    // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
    // Similar to the prevBoundary in the compose() implementation.
    int prevBoundary=src;
    int prevSrc;
    int c=0;
    // fcd16 values hold lccc in bits 15..8 and tccc in bits 7..0 (see the comparisons below).
    // A negative prevFCD16 is ~c for a code point whose lookup was deferred.
    int prevFCD16=0;
    int fcd16=0;

    for(;;) {
        // count code units with lccc==0
        for(prevSrc=src; src!=limit;) {
            if((c=s.charAt(src))<minLcccCP) {
                // Defer the fcd16 lookup: remember the code unit as its bitwise complement.
                prevFCD16=~c;
                ++src;
            } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                prevFCD16=0;
                ++src;
            } else {
                if (UTF16Plus.isLeadSurrogate(c)) {
                    char c2;
                    if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
                        c = Character.toCodePoint((char)c, c2);
                    }
                }
                if((fcd16=getFCD16FromNormData(c))<=0xff) {
                    // lccc==0: keep scanning.
                    prevFCD16=fcd16;
                    src+=Character.charCount(c);
                } else {
                    // lccc!=0: leave the fast loop with c and fcd16 set for this character.
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(src==limit) {
                if(buffer!=null) {
                    buffer.flushAndAppendZeroCC(s, prevSrc, src);
                }
                break;
            }
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            if(prevFCD16<0) {
                // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
                int prev=~prevFCD16;
                if(prev<minDecompNoCP) {
                    prevFCD16=0;
                } else {
                    prevFCD16=getFCD16FromNormData(prev);
                    if(prevFCD16>1) {
                        // tccc>1: the boundary is before this single code unit.
                        --prevBoundary;
                    }
                }
            } else {
                int p=src-1;
                if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
                    Character.isHighSurrogate(s.charAt(p-1))
                ) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>1) {
                    // tccc>1: the boundary is before the previous character.
                    prevBoundary=p;
                }
            }
            if(buffer!=null) {
                // The last lccc==0 character is excluded from the
                // flush-and-append call in case it needs to be modified.
                buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
                buffer.append(s, prevBoundary, src);
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=Character.charCount(c);
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
        // Check for proper order, and decompose locally if necessary.
        if((prevFCD16&0xff)<=(fcd16>>8)) {
            // proper order: prev tccc <= current lccc
            if((fcd16&0xff)<=1) {
                prevBoundary=src;
            }
            if(buffer!=null) {
                buffer.appendZeroCC(c);
            }
            prevFCD16=fcd16;
            continue;
        } else if(buffer==null) {
            return prevBoundary;  // quick check "no"
        } else {
            /*
             * Back out the part of the source that we copied or appended
             * already but is now going to be decomposed.
             * prevSrc is set to after what was copied/appended.
             */
            buffer.removeSuffix(prevSrc-prevBoundary);
            /*
             * Find the part of the source that needs to be decomposed,
             * up to the next safe boundary.
             */
            src=findNextFCDBoundary(s, src, limit);
            /*
             * The source text does not fulfill the conditions for FCD.
             * Decompose and reorder a limited piece of the text.
             */
            decomposeShort(s, prevBoundary, src, false, false, buffer);
            prevBoundary=src;
            prevFCD16=0;
        }
    }
    // Both loop exits above happen only when src==limit.
    return src;
}
makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer)1589     public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) {
1590         int src=0, limit=s.length();
1591         if(!buffer.isEmpty()) {
1592             int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit);
1593             if(0!=firstBoundaryInSrc) {
1594                 int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(),
1595                                                                buffer.length());
1596                 StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+
1597                                                        firstBoundaryInSrc+16);
1598                 middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length());
1599                 buffer.removeSuffix(buffer.length()-lastBoundaryInDest);
1600                 middle.append(s, 0, firstBoundaryInSrc);
1601                 makeFCD(middle, 0, middle.length(), buffer);
1602                 src=firstBoundaryInSrc;
1603             }
1604         }
1605         if(doMakeFCD) {
1606             makeFCD(s, src, limit, buffer);
1607         } else {
1608             buffer.append(s, src, limit);
1609         }
1610     }
1611 
hasDecompBoundaryBefore(int c)1612     public boolean hasDecompBoundaryBefore(int c) {
1613         return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
1614             norm16HasDecompBoundaryBefore(getNorm16(c));
1615     }
norm16HasDecompBoundaryBefore(int norm16)1616     public boolean norm16HasDecompBoundaryBefore(int norm16) {
1617         if (norm16 < minNoNoCompNoMaybeCC) {
1618             return true;
1619         }
1620         if (norm16 >= limitNoNo) {
1621             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1622         }
1623         // c decomposes, get everything from the variable-length extra data
1624         int mapping=norm16>>OFFSET_SHIFT;
1625         int firstUnit=extraData.charAt(mapping);
1626         // true if leadCC==0 (hasFCDBoundaryBefore())
1627         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1628     }
hasDecompBoundaryAfter(int c)1629     public boolean hasDecompBoundaryAfter(int c) {
1630         if (c < minDecompNoCP) {
1631             return true;
1632         }
1633         if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
1634             return true;
1635         }
1636         return norm16HasDecompBoundaryAfter(getNorm16(c));
1637     }
norm16HasDecompBoundaryAfter(int norm16)1638     public boolean norm16HasDecompBoundaryAfter(int norm16) {
1639         if(norm16 <= minYesNo || isHangulLVT(norm16)) {
1640             return true;
1641         }
1642         if (norm16 >= limitNoNo) {
1643             if (isMaybeOrNonZeroCC(norm16)) {
1644                 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1645             }
1646             // Maps to an isCompYesAndZeroCC.
1647             return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1648         }
1649         // c decomposes, get everything from the variable-length extra data
1650         int mapping=norm16>>OFFSET_SHIFT;
1651         int firstUnit=extraData.charAt(mapping);
1652         // decomp after-boundary: same as hasFCDBoundaryAfter(),
1653         // fcd16<=1 || trailCC==0
1654         if(firstUnit>0x1ff) {
1655             return false;  // trailCC>1
1656         }
1657         if(firstUnit<=0xff) {
1658             return true;  // trailCC==0
1659         }
1660         // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1661         // true if leadCC==0 (hasFCDBoundaryBefore())
1662         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1663     }
isDecompInert(int c)1664     public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
1665 
hasCompBoundaryBefore(int c)1666     public boolean hasCompBoundaryBefore(int c) {
1667         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
1668     }
hasCompBoundaryAfter(int c, boolean onlyContiguous)1669     public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) {
1670         return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
1671     }
isCompInert(int c, boolean onlyContiguous)1672     public boolean isCompInert(int c, boolean onlyContiguous) {
1673         int norm16=getNorm16(c);
1674         return isCompYesAndZeroCC(norm16) &&
1675             (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
1676             (!onlyContiguous || isInert(norm16) || extraData.charAt(norm16>>OFFSET_SHIFT) <= 0x1ff);
1677     }
1678 
hasFCDBoundaryBefore(int c)1679     public boolean hasFCDBoundaryBefore(int c) { return hasDecompBoundaryBefore(c); }
hasFCDBoundaryAfter(int c)1680     public boolean hasFCDBoundaryAfter(int c) { return hasDecompBoundaryAfter(c); }
isFCDInert(int c)1681     public boolean isFCDInert(int c) { return getFCD16(c)<=1; }
1682 
isMaybe(int norm16)1683     private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
isMaybeOrNonZeroCC(int norm16)1684     private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
isInert(int norm16)1685     private static boolean isInert(int norm16) { return norm16==INERT; }
isJamoL(int norm16)1686     private static boolean isJamoL(int norm16) { return norm16==JAMO_L; }
isJamoVT(int norm16)1687     private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
hangulLVT()1688     private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
isHangulLV(int norm16)1689     private boolean isHangulLV(int norm16) { return norm16==minYesNo; }
isHangulLVT(int norm16)1690     private boolean isHangulLVT(int norm16) {
1691         return norm16==hangulLVT();
1692     }
isCompYesAndZeroCC(int norm16)1693     private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1694     // UBool isCompYes(uint16_t norm16) const {
1695     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1696     // }
1697     // UBool isCompYesOrMaybe(uint16_t norm16) const {
1698     //     return norm16<minNoNo || minMaybeYes<=norm16;
1699     // }
1700     // private boolean hasZeroCCFromDecompYes(int norm16) {
1701     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1702     // }
isDecompYesAndZeroCC(int norm16)1703     private boolean isDecompYesAndZeroCC(int norm16) {
1704         return norm16<minYesNo ||
1705                norm16==JAMO_VT ||
1706                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1707     }
1708     /**
1709      * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1710      * the MaybeYes which combine-forward and have ccc=0.
1711      * (Standard Unicode 10 normalization does not have such characters.)
1712      */
isMostDecompYesAndZeroCC(int norm16)1713     private boolean isMostDecompYesAndZeroCC(int norm16) {
1714         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1715     }
isDecompNoAlgorithmic(int norm16)1716     private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1717 
1718     // For use with isCompYes().
1719     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1720     // static uint8_t getCCFromYes(uint16_t norm16) {
1721     //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
1722     // }
getCCFromNoNo(int norm16)1723     private int getCCFromNoNo(int norm16) {
1724         int mapping=norm16>>OFFSET_SHIFT;
1725         if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1726             return extraData.charAt(mapping-1)&0xff;
1727         } else {
1728             return 0;
1729         }
1730     }
getTrailCCFromCompYesAndZeroCC(int norm16)1731     int getTrailCCFromCompYesAndZeroCC(int norm16) {
1732         if(norm16<=minYesNo) {
1733             return 0;  // yesYes and Hangul LV have ccc=tccc=0
1734         } else {
1735             // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
1736             return extraData.charAt(norm16>>OFFSET_SHIFT)>>8;  // tccc from yesNo
1737         }
1738     }
1739 
1740     // Requires algorithmic-NoNo.
mapAlgorithmic(int c, int norm16)1741     private int mapAlgorithmic(int c, int norm16) {
1742         return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
1743     }
1744 
1745     // Requires minYesNo<norm16<limitNoNo.
1746     // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); }
1747 
1748     /**
1749      * @return index into maybeYesCompositions, or -1
1750      */
getCompositionsListForDecompYes(int norm16)1751     private int getCompositionsListForDecompYes(int norm16) {
1752         if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
1753             return -1;
1754         } else {
1755             if((norm16-=minMaybeYes)<0) {
1756                 // norm16<minMaybeYes: index into extraData which is a substring at
1757                 //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1758                 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1759                 norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1760             }
1761             return norm16>>OFFSET_SHIFT;
1762         }
1763     }
1764     /**
1765      * @return index into maybeYesCompositions
1766      */
getCompositionsListForComposite(int norm16)1767     private int getCompositionsListForComposite(int norm16) {
1768         // A composite has both mapping & compositions list.
1769         int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
1770         int firstUnit=maybeYesCompositions.charAt(list);
1771         return list+  // mapping in maybeYesCompositions
1772             1+  // +1 to skip the first unit with the mapping length
1773             (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1774     }
getCompositionsListForMaybe(int norm16)1775     private int getCompositionsListForMaybe(int norm16) {
1776         // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES
1777         return (norm16-minMaybeYes)>>OFFSET_SHIFT;
1778     }
1779     /**
1780      * @param c code point must have compositions
1781      * @return index into maybeYesCompositions
1782      */
getCompositionsList(int norm16)1783     private int getCompositionsList(int norm16) {
1784         return isDecompYes(norm16) ?
1785                 getCompositionsListForDecompYes(norm16) :
1786                 getCompositionsListForComposite(norm16);
1787     }
1788 
1789     // Decompose a short piece of text which is likely to contain characters that
1790     // fail the quick check loop and/or where the quick check loop's overhead
1791     // is unlikely to be amortized.
1792     // Called by the compose() and makeFCD() implementations.
1793     // Public in Java for collation implementation code.
decomposeShort( CharSequence s, int src, int limit, boolean stopAtCompBoundary, boolean onlyContiguous, ReorderingBuffer buffer)1794     private int decomposeShort(
1795             CharSequence s, int src, int limit,
1796             boolean stopAtCompBoundary, boolean onlyContiguous,
1797             ReorderingBuffer buffer) {
1798         while(src<limit) {
1799             int c=Character.codePointAt(s, src);
1800             if (stopAtCompBoundary && c < minCompNoMaybeCP) {
1801                 return src;
1802             }
1803             int norm16 = getNorm16(c);
1804             if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1805                 return src;
1806             }
1807             src+=Character.charCount(c);
1808             decompose(c, norm16, buffer);
1809             if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1810                 return src;
1811             }
1812         }
1813         return src;
1814     }
decompose(int c, int norm16, ReorderingBuffer buffer)1815     private void decompose(int c, int norm16, ReorderingBuffer buffer) {
1816         // get the decomposition and the lead and trail cc's
1817         if (norm16 >= limitNoNo) {
1818             if (isMaybeOrNonZeroCC(norm16)) {
1819                 buffer.append(c, getCCFromYesOrMaybe(norm16));
1820                 return;
1821             }
1822             // Maps to an isCompYesAndZeroCC.
1823             c=mapAlgorithmic(c, norm16);
1824             norm16 = getRawNorm16(c);
1825         }
1826         if (norm16 < minYesNo) {
1827             // c does not decompose
1828             buffer.append(c, 0);
1829         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1830             // Hangul syllable: decompose algorithmically
1831             Hangul.decompose(c, buffer);
1832         } else {
1833             // c decomposes, get everything from the variable-length extra data
1834             int mapping=norm16>>OFFSET_SHIFT;
1835             int firstUnit=extraData.charAt(mapping);
1836             int length=firstUnit&MAPPING_LENGTH_MASK;
1837             int leadCC, trailCC;
1838             trailCC=firstUnit>>8;
1839             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1840                 leadCC=extraData.charAt(mapping-1)>>8;
1841             } else {
1842                 leadCC=0;
1843             }
1844             ++mapping;  // skip over the firstUnit
1845             buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC);
1846         }
1847     }
1848 
1849     /**
1850      * Finds the recomposition result for
1851      * a forward-combining "lead" character,
1852      * specified with a pointer to its compositions list,
1853      * and a backward-combining "trail" character.
1854      *
1855      * <p>If the lead and trail characters combine, then this function returns
1856      * the following "compositeAndFwd" value:
1857      * <pre>
1858      * Bits 21..1  composite character
1859      * Bit      0  set if the composite is a forward-combining starter
1860      * </pre>
1861      * otherwise it returns -1.
1862      *
1863      * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1864      * encoded as either pairs or triples of 16-bit units.
1865      * The last entry has the high bit of its first unit set.
1866      *
1867      * <p>The list is sorted by ascending trail characters (there are no duplicates).
1868      * A linear search is used.
1869      *
1870      * <p>See normalizer2impl.h for a more detailed description
1871      * of the compositions list format.
1872      */
combine(String compositions, int list, int trail)1873     private static int combine(String compositions, int list, int trail) {
1874         int key1, firstUnit;
1875         if(trail<COMP_1_TRAIL_LIMIT) {
1876             // trail character is 0..33FF
1877             // result entry may have 2 or 3 units
1878             key1=(trail<<1);
1879             while(key1>(firstUnit=compositions.charAt(list))) {
1880                 list+=2+(firstUnit&COMP_1_TRIPLE);
1881             }
1882             if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1883                 if((firstUnit&COMP_1_TRIPLE)!=0) {
1884                     return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1885                 } else {
1886                     return compositions.charAt(list+1);
1887                 }
1888             }
1889         } else {
1890             // trail character is 3400..10FFFF
1891             // result entry has 3 units
1892             key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1893             int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1894             int secondUnit;
1895             for(;;) {
1896                 if(key1>(firstUnit=compositions.charAt(list))) {
1897                     list+=2+(firstUnit&COMP_1_TRIPLE);
1898                 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1899                     if(key2>(secondUnit=compositions.charAt(list+1))) {
1900                         if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1901                             break;
1902                         } else {
1903                             list+=3;
1904                         }
1905                     } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1906                         return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
1907                     } else {
1908                         break;
1909                     }
1910                 } else {
1911                     break;
1912                 }
1913             }
1914         }
1915         return -1;
1916     }
1917     /**
1918      * @param list some character's compositions list
1919      * @param set recursively receives the composites from these compositions
1920      */
addComposites(int list, UnicodeSet set)1921     private void addComposites(int list, UnicodeSet set) {
1922         int firstUnit, compositeAndFwd;
1923         do {
1924             firstUnit=maybeYesCompositions.charAt(list);
1925             if((firstUnit&COMP_1_TRIPLE)==0) {
1926                 compositeAndFwd=maybeYesCompositions.charAt(list+1);
1927                 list+=2;
1928             } else {
1929                 compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
1930                                 maybeYesCompositions.charAt(list+2);
1931                 list+=3;
1932             }
1933             int composite=compositeAndFwd>>1;
1934             if((compositeAndFwd&1)!=0) {
1935                 addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);
1936             }
1937             set.add(composite);
1938         } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1939     }
1940     /*
1941      * Recomposes the buffer text starting at recomposeStartIndex
1942      * (which is in NFD - decomposed and canonically ordered),
1943      * and truncates the buffer contents.
1944      *
1945      * Note that recomposition never lengthens the text:
1946      * Any character consists of either one or two code units;
1947      * a composition may contain at most one more code unit than the original starter,
1948      * while the combining mark that is removed has at least one code unit.
1949      */
recompose(ReorderingBuffer buffer, int recomposeStartIndex, boolean onlyContiguous)1950     private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
1951                            boolean onlyContiguous) {
1952         StringBuilder sb=buffer.getStringBuilder();
1953         int p=recomposeStartIndex;
1954         if(p==sb.length()) {
1955             return;
1956         }
1957 
1958         int starter, pRemove;
1959         int compositionsList;
1960         int c, compositeAndFwd;
1961         int norm16;
1962         int cc, prevCC;
1963         boolean starterIsSupplementary;
1964 
1965         // Some of the following variables are not used until we have a forward-combining starter
1966         // and are only initialized now to avoid compiler warnings.
1967         compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1968         starter=-1;
1969         starterIsSupplementary=false;
1970         prevCC=0;
1971 
1972         for(;;) {
1973             c=sb.codePointAt(p);
1974             p+=Character.charCount(c);
1975             norm16=getNorm16(c);
1976             cc=getCCFromYesOrMaybe(norm16);
1977             if( // this character combines backward and
1978                 isMaybe(norm16) &&
1979                 // we have seen a starter that combines forward and
1980                 compositionsList>=0 &&
1981                 // the backward-combining character is not blocked
1982                 (prevCC<cc || prevCC==0)
1983             ) {
1984                 if(isJamoVT(norm16)) {
1985                     // c is a Jamo V/T, see if we can compose it with the previous character.
1986                     if(c<Hangul.JAMO_T_BASE) {
1987                         // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1988                         char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1989                         if(prev<Hangul.JAMO_L_COUNT) {
1990                             pRemove=p-1;
1991                             char syllable=(char)
1992                                 (Hangul.HANGUL_BASE+
1993                                  (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1994                                  Hangul.JAMO_T_COUNT);
1995                             char t;
1996                             if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1997                                 ++p;
1998                                 syllable+=t;  // The next character was a Jamo T.
1999                             }
2000                             sb.setCharAt(starter, syllable);
2001                             // remove the Jamo V/T
2002                             sb.delete(pRemove, p);
2003                             p=pRemove;
2004                         }
2005                     }
2006                     /*
2007                      * No "else" for Jamo T:
2008                      * Since the input is in NFD, there are no Hangul LV syllables that
2009                      * a Jamo T could combine with.
2010                      * All Jamo Ts are combined above when handling Jamo Vs.
2011                      */
2012                     if(p==sb.length()) {
2013                         break;
2014                     }
2015                     compositionsList=-1;
2016                     continue;
2017                 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
2018                     // The starter and the combining mark (c) do combine.
2019                     int composite=compositeAndFwd>>1;
2020 
2021                     // Remove the combining mark.
2022                     pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
2023                     sb.delete(pRemove, p);
2024                     p=pRemove;
2025                     // Replace the starter with the composite.
2026                     if(starterIsSupplementary) {
2027                         if(composite>0xffff) {
2028                             // both are supplementary
2029                             sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
2030                             sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
2031                         } else {
2032                             sb.setCharAt(starter, (char)c);
2033                             sb.deleteCharAt(starter+1);
2034                             // The composite is shorter than the starter,
2035                             // move the intermediate characters forward one.
2036                             starterIsSupplementary=false;
2037                             --p;
2038                         }
2039                     } else if(composite>0xffff) {
2040                         // The composite is longer than the starter,
2041                         // move the intermediate characters back one.
2042                         starterIsSupplementary=true;
2043                         sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
2044                         sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
2045                         ++p;
2046                     } else {
2047                         // both are on the BMP
2048                         sb.setCharAt(starter, (char)composite);
2049                     }
2050 
2051                     // Keep prevCC because we removed the combining mark.
2052 
2053                     if(p==sb.length()) {
2054                         break;
2055                     }
2056                     // Is the composite a starter that combines forward?
2057                     if((compositeAndFwd&1)!=0) {
2058                         compositionsList=
2059                             getCompositionsListForComposite(getRawNorm16(composite));
2060                     } else {
2061                         compositionsList=-1;
2062                     }
2063 
2064                     // We combined; continue with looking for compositions.
2065                     continue;
2066                 }
2067             }
2068 
2069             // no combination this time
2070             prevCC=cc;
2071             if(p==sb.length()) {
2072                 break;
2073             }
2074 
2075             // If c did not combine, then check if it is a starter.
2076             if(cc==0) {
2077                 // Found a new starter.
2078                 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
2079                     // It may combine with something, prepare for it.
2080                     if(c<=0xffff) {
2081                         starterIsSupplementary=false;
2082                         starter=p-1;
2083                     } else {
2084                         starterIsSupplementary=true;
2085                         starter=p-2;
2086                     }
2087                 }
2088             } else if(onlyContiguous) {
2089                 // FCC: no discontiguous compositions; any intervening character blocks.
2090                 compositionsList=-1;
2091             }
2092         }
2093         buffer.flush();
2094     }
2095 
composePair(int a, int b)2096     public int composePair(int a, int b) {
2097         int norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16
2098         int list;
2099         if(isInert(norm16)) {
2100             return -1;
2101         } else if(norm16<minYesNoMappingsOnly) {
2102             // a combines forward.
2103             if(isJamoL(norm16)) {
2104                 b-=Hangul.JAMO_V_BASE;
2105                 if(0<=b && b<Hangul.JAMO_V_COUNT) {
2106                     return
2107                         (Hangul.HANGUL_BASE+
2108                          ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)*
2109                          Hangul.JAMO_T_COUNT);
2110                 } else {
2111                     return -1;
2112                 }
2113             } else if(isHangulLV(norm16)) {
2114                 b-=Hangul.JAMO_T_BASE;
2115                 if(0<b && b<Hangul.JAMO_T_COUNT) {  // not b==0!
2116                     return a+b;
2117                 } else {
2118                     return -1;
2119                 }
2120             } else {
2121                 // 'a' has a compositions list in extraData
2122                 list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
2123                 if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
2124                     list+=  // mapping pointer
2125                         1+  // +1 to skip the first unit with the mapping length
2126                         (maybeYesCompositions.charAt(list)&MAPPING_LENGTH_MASK);  // + mapping length
2127                 }
2128             }
2129         } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
2130             return -1;
2131         } else {
2132             list=getCompositionsListForMaybe(norm16);  // offset into maybeYesCompositions
2133         }
2134         if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
2135             return -1;
2136         }
2137         return combine(maybeYesCompositions, list, b)>>1;
2138     }
2139 
2140     /**
2141      * Does c have a composition boundary before it?
2142      * True if its decomposition begins with a character that has
2143      * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
2144      * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
2145      * (isCompYesAndZeroCC()) so we need not decompose.
2146      */
hasCompBoundaryBefore(int c, int norm16)2147     private boolean hasCompBoundaryBefore(int c, int norm16) {
2148         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
2149     }
norm16HasCompBoundaryBefore(int norm16)2150     private boolean norm16HasCompBoundaryBefore(int norm16) {
2151         return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
2152     }
hasCompBoundaryBefore(CharSequence s, int src, int limit)2153     private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) {
2154         return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src));
2155     }
norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous)2156     private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) {
2157         return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
2158             (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
2159     }
hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous)2160     private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) {
2161         return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous);
2162     }
2163     /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
isTrailCC01ForCompBoundaryAfter(int norm16)2164     private boolean isTrailCC01ForCompBoundaryAfter(int norm16) {
2165         return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
2166             (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff);
2167     }
2168 
findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous)2169     private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) {
2170         while(p>0) {
2171             int c=Character.codePointBefore(s, p);
2172             int norm16 = getNorm16(c);
2173             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2174                 break;
2175             }
2176             p-=Character.charCount(c);
2177             if(hasCompBoundaryBefore(c, norm16)) {
2178                 break;
2179             }
2180         }
2181         return p;
2182     }
findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous)2183     private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) {
2184         while(p<limit) {
2185             int c=Character.codePointAt(s, p);
2186             int norm16=normTrie.get(c);
2187             if(hasCompBoundaryBefore(c, norm16)) {
2188                 break;
2189             }
2190             p+=Character.charCount(c);
2191             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2192                 break;
2193             }
2194         }
2195         return p;
2196     }
2197 
findPreviousFCDBoundary(CharSequence s, int p)2198     private int findPreviousFCDBoundary(CharSequence s, int p) {
2199         while(p>0) {
2200             int c=Character.codePointBefore(s, p);
2201             int norm16;
2202             if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16 = getNorm16(c))) {
2203                 break;
2204             }
2205             p-=Character.charCount(c);
2206             if (norm16HasDecompBoundaryBefore(norm16)) {
2207                 break;
2208             }
2209         }
2210         return p;
2211     }
findNextFCDBoundary(CharSequence s, int p, int limit)2212     private int findNextFCDBoundary(CharSequence s, int p, int limit) {
2213         while(p<limit) {
2214             int c=Character.codePointAt(s, p);
2215             int norm16;
2216             if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) {
2217                 break;
2218             }
2219             p+=Character.charCount(c);
2220             if (norm16HasDecompBoundaryAfter(norm16)) {
2221                 break;
2222             }
2223         }
2224         return p;
2225     }
2226 
getPreviousTrailCC(CharSequence s, int start, int p)2227     private int getPreviousTrailCC(CharSequence s, int start, int p) {
2228         if (start == p) {
2229             return 0;
2230         }
2231         return getFCD16(Character.codePointBefore(s, p));
2232     }
2233 
addToStartSet(MutableCodePointTrie mutableTrie, int origin, int decompLead)2234     private void addToStartSet(MutableCodePointTrie mutableTrie, int origin, int decompLead) {
2235         int canonValue = mutableTrie.get(decompLead);
2236         if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
2237             // origin is the first character whose decomposition starts with
2238             // the character for which we are setting the value.
2239             mutableTrie.set(decompLead, canonValue|origin);
2240         } else {
2241             // origin is not the first character, or it is U+0000.
2242             UnicodeSet set;
2243             if((canonValue&CANON_HAS_SET)==0) {
2244                 int firstOrigin=canonValue&CANON_VALUE_MASK;
2245                 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();
2246                 mutableTrie.set(decompLead, canonValue);
2247                 canonStartSets.add(set=new UnicodeSet());
2248                 if(firstOrigin!=0) {
2249                     set.add(firstOrigin);
2250                 }
2251             } else {
2252                 set=canonStartSets.get(canonValue&CANON_VALUE_MASK);
2253             }
2254             set.add(origin);
2255         }
2256     }
2257 
    // NOTE(review): presumably the version of the loaded normalization data;
    // the "unused" suppression suggests it is stored but not read in this class - confirm.
    @SuppressWarnings("unused")
    private VersionInfo dataVersion;

    // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
    // Code points below a threshold are handled without consulting their norm16 value:
    // findPreviousFCDBoundary() stops at c<minDecompNoCP,
    // hasCompBoundaryBefore(c, norm16) returns true for c<minCompNoMaybeCP,
    // findNextFCDBoundary() stops at c<minLcccCP.
    private int minDecompNoCP;
    private int minCompNoMaybeCP;
    private int minLcccCP;

    // Norm16 value thresholds for quick check combinations and types of extra data.
    // composePair() uses minYesNo, minYesNoMappingsOnly and minMaybeYes to decide whether
    // and where a character has a compositions list;
    // norm16HasCompBoundaryBefore() treats values below minNoNoCompNoMaybeCC as
    // having a composition boundary before them.
    private int minYesNo;
    private int minYesNoMappingsOnly;
    private int minNoNo;
    private int minNoNoCompBoundaryBefore;
    private int minNoNoCompNoMaybeCC;
    private int minNoNoEmpty;
    private int limitNoNo;
    private int centerNoNoDelta;
    private int minMaybeYes;

    // Trie that yields the norm16 value of a code point (see findNextCompBoundary()).
    private CodePointTrie.Fast16 normTrie;
    // Holds the compositions lists; indexed by the list offsets used in
    // addComposites(), recompose() and composePair().
    private String maybeYesCompositions;
    private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
    private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0

    // NOTE(review): presumably the immutable form of the trie built via addToStartSet() - confirm.
    private CodePointTrie canonIterData;
    // Sets of characters whose decompositions start with a given character;
    // addToStartSet() appends to this list and stores the list index in the trie value.
    private ArrayList<UnicodeSet> canonStartSets;

    // bits in canonIterData
    private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
    private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
    // Set when the value bits are an index into canonStartSets rather than a single code point.
    private static final int CANON_HAS_SET = 0x200000;
    // Low 21 bits: either a single origin code point or a canonStartSets index (see CANON_HAS_SET).
    private static final int CANON_VALUE_MASK = 0x1fffff;
2290 }
2291