1 /**
2 *******************************************************************************
3 * Copyright (C) 1996-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7 package com.ibm.icu.text;
8 
9 import java.text.CharacterIterator;
10 import java.util.HashMap;
11 import java.util.Map;
12 
13 import com.ibm.icu.impl.CharacterIteratorWrapper;
14 import com.ibm.icu.impl.coll.Collation;
15 import com.ibm.icu.impl.coll.CollationData;
16 import com.ibm.icu.impl.coll.CollationIterator;
17 import com.ibm.icu.impl.coll.ContractionsAndExpansions;
18 import com.ibm.icu.impl.coll.FCDIterCollationIterator;
19 import com.ibm.icu.impl.coll.FCDUTF16CollationIterator;
20 import com.ibm.icu.impl.coll.IterCollationIterator;
21 import com.ibm.icu.impl.coll.UTF16CollationIterator;
22 import com.ibm.icu.impl.coll.UVector32;
23 
24 /**
25  * <p><code>CollationElementIterator</code> is an iterator created by
26  * a RuleBasedCollator to walk through a string. The return result of
27  * each iteration is a 32-bit collation element (CE) that defines the
28  * ordering priority of the next character or sequence of characters
29  * in the source string.</p>
30  *
31  * <p>For illustration, consider the following in Slovak and in traditional Spanish collation:
32  * <blockquote>
33  * <pre>
34  * "ca" -> the first collation element is CE('c') and the second
35  *         collation element is CE('a').
36  * "cha" -> the first collation element is CE('ch') and the second
37  *          collation element is CE('a').
38  * </pre>
39  * </blockquote>
40  * And in German phonebook collation,
41  * <blockquote>
42  * <pre>
43  * Since the character '&#230;' is a composed character of 'a' and 'e', the
44  * iterator returns two collation elements for the single character '&#230;'
45  *
46  * "&#230;b" -> the first collation element is collation_element('a'), the
47  *              second collation element is collation_element('e'), and the
48  *              third collation element is collation_element('b').
49  * </pre>
50  * </blockquote>
51  * </p>
52  *
53  * <p>For collation ordering comparison, the collation element results
54  * can not be compared simply by using basic arithmetic operators,
55  * e.g. &lt;, == or &gt;, further processing has to be done. Details
56  * can be found in the ICU
57  * <a href="http://userguide.icu-project.org/collation/architecture">
58  * User Guide</a>. An example of using the CollationElementIterator
59  * for collation ordering comparison is the class
60  * {@link com.ibm.icu.text.StringSearch}.</p>
61  *
62  * <p>To construct a CollationElementIterator object, users
63  * call the method getCollationElementIterator() on a
64  * RuleBasedCollator that defines the desired sorting order.</p>
65  *
66  * <p> Example:
67  * <blockquote>
68  * <pre>
69  *  String testString = "This is a test";
70  *  RuleBasedCollator rbc = new RuleBasedCollator("&amp;a&lt;b");
71  *  CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
 *  int order;
 *  while ((order = iterator.next()) != CollationElementIterator.NULLORDER) {
 *      if (order != CollationElementIterator.IGNORABLE) {
 *          // order is valid, not ignorable and we have not passed the end
 *          // of the iteration, we do something
 *          int primaryOrder = CollationElementIterator.primaryOrder(order);
 *          System.out.println("Next primary order 0x" +
 *                             Integer.toHexString(primaryOrder));
 *      }
 *  }
84  * </pre>
85  * </blockquote>
86  * </p>
87  * <p>
88  * The method next() returns the collation order of the next character based on
89  * the comparison level of the collator. The method previous() returns the
90  * collation order of the previous character based on the comparison level of
91  * the collator. The Collation Element Iterator moves only in one direction
92  * between calls to reset(), setOffset(), or setText(). That is, next() and
93  * previous() can not be inter-used. Whenever previous() is to be called after
94  * next() or vice versa, reset(), setOffset() or setText() has to be called first
95  * to reset the status, shifting current position to either the end or the start of
96  * the string (reset() or setText()), or the specified position (setOffset()).
 * Hence at the next call of next() or previous(), the first or last collation order,
 * or collation order at the specified position will be returned. If a change of
 * direction is attempted without one of these calls, next() or previous() throws
 * an IllegalStateException.
100  * </p>
101  * <p>
102  * This class is not subclassable.
103  * </p>
104  * @see Collator
105  * @see RuleBasedCollator
106  * @see StringSearch
107  * @author Syn Wee Quek
108  * @stable ICU 2.8
109  */
110 public final class CollationElementIterator
111 {
    /** Low-level iterator that produces the 64-bit CEs; replaced by every setText() call. */
    private CollationIterator iter_;  // owned
    /** Collator whose data and settings are used; the caller's object is referenced, not copied. */
    private RuleBasedCollator rbc_;  // aliased
    /**
     * Second 32-bit half of a split 64-bit CE that still has to be returned by
     * next() or previous(); 0 when nothing is pending.
     */
    private int otherHalf_;
    /**
     * <0: backwards; 0: just after reset() (previous() begins from end);
     * 1: just after setOffset(); >1: forward
     */
    private byte dir_;
    /**
     * Stores offsets from expansions and from unsafe-backwards iteration,
     * so that getOffset() returns intermediate offsets for the CEs
     * that are consistent with forward iteration.
     * Created lazily by the first call to previous().
     */
    private UVector32 offsets_;

    /** Text being iterated; read by previous() to find the end and by setOffset(). */
    private String string_;  // TODO: needed in Java? if so, then add a UCharacterIterator field too?


    /**
     * <p>This constant is returned by the iterator in the methods
     * next() and previous() when the end or the beginning of the
     * source string has been reached, and there are no more valid
     * collation elements to return.</p>
     *
     * <p>All 32 bits are set, so as a signed int the value is -1.</p>
     *
     * <p>See class documentation for an example of use.</p>
     * @stable ICU 2.8
     * @see #next
     * @see #previous */
    public final static int NULLORDER = 0xffffffff;

    /**
     * <p>This constant is returned by the iterator in the methods
     * next() and previous() when a collation element result is to be
     * ignored.</p>
     *
     * <p>See class documentation for an example of use.</p>
     * @stable ICU 2.8
     * @see #next
     * @see #previous */
    public static final int IGNORABLE = 0;
152 
153     /**
154      * Return the primary order of the specified collation element,
155      * i.e. the first 16 bits.  This value is unsigned.
156      * @param ce the collation element
157      * @return the element's 16 bits primary order.
158      * @stable ICU 2.8
159      */
primaryOrder(int ce)160     public final static int primaryOrder(int ce) {
161         return (ce >>> 16) & 0xffff;
162     }
163 
164     /**
165      * Return the secondary order of the specified collation element,
166      * i.e. the 16th to 23th bits, inclusive.  This value is unsigned.
167      * @param ce the collation element
168      * @return the element's 8 bits secondary order
169      * @stable ICU 2.8
170      */
secondaryOrder(int ce)171     public final static int secondaryOrder(int ce) {
172         return (ce >>> 8) & 0xff;
173     }
174 
175     /**
176      * Return the tertiary order of the specified collation element, i.e. the last
177      * 8 bits.  This value is unsigned.
178      * @param ce the collation element
179      * @return the element's 8 bits tertiary order
180      * @stable ICU 2.8
181      */
tertiaryOrder(int ce)182     public final static int tertiaryOrder(int ce) {
183         return ce & 0xff;
184     }
185 
186 
getFirstHalf(long p, int lower32)187     private static final int getFirstHalf(long p, int lower32) {
188         return ((int)p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
189     }
190 
getSecondHalf(long p, int lower32)191     private static final int getSecondHalf(long p, int lower32) {
192         return ((int)p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
193     }
194 
ceNeedsTwoParts(long ce)195     private static final boolean ceNeedsTwoParts(long ce) {
196         return (ce & 0xffff00ff003fL) != 0;
197     }
198 
CollationElementIterator(RuleBasedCollator collator)199     private CollationElementIterator(RuleBasedCollator collator) {
200         iter_ = null;
201         rbc_ = collator;
202         otherHalf_ = 0;
203         dir_ = 0;
204         offsets_ = null;
205     }
206 
207     /**
208      * <p>CollationElementIterator constructor. This takes a source
209      * string and a RuleBasedCollator. The iterator will walk through
210      * the source string based on the rules defined by the
211      * collator. If the source string is empty, NULLORDER will be
212      * returned on the first call to next().</p>
213      *
214      * @param source the source string.
215      * @param collator the RuleBasedCollator
216      * @stable ICU 2.8
217      */
CollationElementIterator(String source, RuleBasedCollator collator)218     CollationElementIterator(String source, RuleBasedCollator collator) {
219         this(collator);
220         setText(source);
221     }
222     // Note: The constructors should take settings & tailoring, not a collator,
223     // to avoid circular dependencies.
224     // However, for equals() we would need to be able to compare tailoring data for equality
225     // without making CollationData or CollationTailoring depend on TailoredSet.
226     // (See the implementation of RuleBasedCollator.equals().)
227     // That might require creating an intermediate class that would be used
228     // by both CollationElementIterator and RuleBasedCollator
229     // but only contain the part of RBC.equals() related to data and rules.
230 
231     /**
232      * <p>CollationElementIterator constructor. This takes a source
233      * character iterator and a RuleBasedCollator. The iterator will
234      * walk through the source string based on the rules defined by
235      * the collator. If the source string is empty, NULLORDER will be
236      * returned on the first call to next().</p>
237      *
238      * @param source the source string iterator.
239      * @param collator the RuleBasedCollator
240      * @stable ICU 2.8
241      */
CollationElementIterator(CharacterIterator source, RuleBasedCollator collator)242     CollationElementIterator(CharacterIterator source, RuleBasedCollator collator) {
243         this(collator);
244         setText(source);
245     }
246 
247     /**
248      * <p>CollationElementIterator constructor. This takes a source
249      * character iterator and a RuleBasedCollator. The iterator will
250      * walk through the source string based on the rules defined by
251      * the collator. If the source string is empty, NULLORDER will be
252      * returned on the first call to next().</p>
253      *
254      * @param source the source string iterator.
255      * @param collator the RuleBasedCollator
256      * @stable ICU 2.8
257      */
CollationElementIterator(UCharacterIterator source, RuleBasedCollator collator)258     CollationElementIterator(UCharacterIterator source, RuleBasedCollator collator) {
259         this(collator);
260         setText(source);
261     }
262 
263     /**
264      * <p>Returns the character offset in the source string
265      * corresponding to the next collation element. I.e., getOffset()
266      * returns the position in the source string corresponding to the
267      * collation element that will be returned by the next call to
268      * next() or previous(). This value could be any of:
269      * <ul>
270      * <li> The index of the <b>first</b> character corresponding to
271      * the next collation element. (This means that if
272      * <code>setOffset(offset)</code> sets the index in the middle of
273      * a contraction, <code>getOffset()</code> returns the index of
274      * the first character in the contraction, which may not be equal
275      * to the original offset that was set. Hence calling getOffset()
276      * immediately after setOffset(offset) does not guarantee that the
277      * original offset set will be returned.)
278      * <li> If normalization is on, the index of the <b>immediate</b>
279      * subsequent character, or composite character with the first
280      * character, having a combining class of 0.
281      * <li> The length of the source string, if iteration has reached
282      * the end.
283      *</ul>
284      * </p>
285      * @return The character offset in the source string corresponding to the
286      *         collation element that will be returned by the next call to
287      *         next() or previous().
288      * @stable ICU 2.8
289      */
getOffset()290     public int getOffset() {
291         if (dir_ < 0 && offsets_ != null && !offsets_.isEmpty()) {
292             // CollationIterator.previousCE() decrements the CEs length
293             // while it pops CEs from its internal buffer.
294             int i = iter_.getCEsLength();
295             if (otherHalf_ != 0) {
296                 // Return the trailing CE offset while we are in the middle of a 64-bit CE.
297                 ++i;
298             }
299             assert (i < offsets_.size());
300             return offsets_.elementAti(i);
301         }
302         return iter_.getOffset();
303     }
304 
305     /**
306      * <p>Get the next collation element in the source string.</p>
307      *
308      * <p>This iterator iterates over a sequence of collation elements
309      * that were built from the string. Because there isn't
310      * necessarily a one-to-one mapping from characters to collation
311      * elements, this doesn't mean the same thing as "return the
312      * collation element [or ordering priority] of the next character
313      * in the string".</p>
314      *
315      * <p>This function returns the collation element that the
316      * iterator is currently pointing to, and then updates the
317      * internal pointer to point to the next element.</p>
318      *
319      * @return the next collation element or NULLORDER if the end of the
320      *         iteration has been reached.
321      * @stable ICU 2.8
322      */
323     public int next() {
324         if (dir_ > 1) {
325             // Continue forward iteration. Test this first.
326             if (otherHalf_ != 0) {
327                 int oh = otherHalf_;
328                 otherHalf_ = 0;
329                 return oh;
330             }
331         } else if (dir_ == 1) {
332             // next() after setOffset()
333             dir_ = 2;
334         } else if (dir_ == 0) {
335             // The iter_ is already reset to the start of the text.
336             dir_ = 2;
337         } else /* dir_ < 0 */{
338             // illegal change of direction
339             throw new IllegalStateException("Illegal change of direction");
340             // Java porting note: ICU4C sets U_INVALID_STATE_ERROR to the return status.
341         }
342         // No need to keep all CEs in the buffer when we iterate.
343         iter_.clearCEsIfNoneRemaining();
344         long ce = iter_.nextCE();
345         if (ce == Collation.NO_CE) {
346             return NULLORDER;
347         }
348         // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
349         long p = ce >>> 32;
350         int lower32 = (int) ce;
351         int firstHalf = getFirstHalf(p, lower32);
352         int secondHalf = getSecondHalf(p, lower32);
353         if (secondHalf != 0) {
354             otherHalf_ = secondHalf | 0xc0; // continuation CE
355         }
356         return firstHalf;
357     }
358 
359     /**
360      * <p>Get the previous collation element in the source string.</p>
361      *
362      * <p>This iterator iterates over a sequence of collation elements
363      * that were built from the string. Because there isn't
364      * necessarily a one-to-one mapping from characters to collation
365      * elements, this doesn't mean the same thing as "return the
366      * collation element [or ordering priority] of the previous
367      * character in the string".</p>
368      *
369      * <p>This function updates the iterator's internal pointer to
370      * point to the collation element preceding the one it's currently
371      * pointing to and then returns that element, while next() returns
372      * the current element and then updates the pointer.</p>
373      *
374      * @return the previous collation element, or NULLORDER when the start of
375      *             the iteration has been reached.
376      * @stable ICU 2.8
377      */
previous()378     public int previous() {
379         if (dir_ < 0) {
380             // Continue backwards iteration. Test this first.
381             if (otherHalf_ != 0) {
382                 int oh = otherHalf_;
383                 otherHalf_ = 0;
384                 return oh;
385             }
386         } else if (dir_ == 0) {
387             iter_.resetToOffset(string_.length());
388             dir_ = -1;
389         } else if (dir_ == 1) {
390             // previous() after setOffset()
391             dir_ = -1;
392         } else /* dir_ > 1 */{
393             // illegal change of direction
394             throw new IllegalStateException("Illegal change of direction");
395             // Java porting note: ICU4C sets U_INVALID_STATE_ERROR to the return status.
396         }
397         if (offsets_ == null) {
398             offsets_ = new UVector32();
399         }
400         // If we already have expansion CEs, then we also have offsets.
401         // Otherwise remember the trailing offset in case we need to
402         // write offsets for an artificial expansion.
403         int limitOffset = iter_.getCEsLength() == 0 ? iter_.getOffset() : 0;
404         long ce = iter_.previousCE(offsets_);
405         if (ce == Collation.NO_CE) {
406             return NULLORDER;
407         }
408         // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
409         long p = ce >>> 32;
410         int lower32 = (int) ce;
411         int firstHalf = getFirstHalf(p, lower32);
412         int secondHalf = getSecondHalf(p, lower32);
413         if (secondHalf != 0) {
414             if (offsets_.isEmpty()) {
415                 // When we convert a single 64-bit CE into two 32-bit CEs,
416                 // we need to make this artificial expansion behave like a normal expansion.
417                 // See CollationIterator.previousCE().
418                 offsets_.addElement(iter_.getOffset());
419                 offsets_.addElement(limitOffset);
420             }
421             otherHalf_ = firstHalf;
422             return secondHalf | 0xc0; // continuation CE
423         }
424         return firstHalf;
425     }
426 
427     /**
428      * <p> Resets the cursor to the beginning of the string. The next
429      * call to next() or previous() will return the first and last
430      * collation element in the string, respectively.</p>
431      *
432      * <p>If the RuleBasedCollator used by this iterator has had its
433      * attributes changed, calling reset() will reinitialize the
434      * iterator to use the new attributes.</p>
435      *
436      * @stable ICU 2.8
437      */
reset()438     public void reset() {
439         iter_ .resetToOffset(0);
440         otherHalf_ = 0;
441         dir_ = 0;
442     }
443 
    /**
     * <p> Sets the iterator to point to the collation element
     * corresponding to the character at the specified offset. The
     * value returned by the next call to next() will be the collation
     * element corresponding to the characters at offset.</p>
     *
     * <p>If offset is in the middle of a contracting character
     * sequence, the iterator is adjusted to the start of the
     * contracting sequence. This means that getOffset() is not
     * guaranteed to return the same value set by this method.</p>
     *
     * <p>If the decomposition mode is on, and offset is in the middle
     * of a decomposable range of source text, the iterator may not
     * return a correct result for the next forwards or backwards
     * iteration.  The user must ensure that the offset is not in the
     * middle of a decomposable range.</p>
     *
     * <p>Offsets that are not strictly inside the text (at most 0, or at
     * least the text length) are passed to the underlying iterator
     * unchanged. After this call either next() or previous() may be
     * called.</p>
     *
     * @param newOffset the character offset into the original source string to
     *        set. Note that this is not an offset into the corresponding
     *        sequence of collation elements.
     * @stable ICU 2.8
     */
    public void setOffset(int newOffset) {
        // Only offsets strictly inside the text can be in the middle of a sequence.
        if (0 < newOffset && newOffset < string_.length()) {
            int offset = newOffset;
            // Step 1: walk backwards while the code unit at offset is "unsafe"
            // according to the collator, stopping at offset 0 at the latest.
            do {
                char c = string_.charAt(offset);
                // A lead surrogate is judged by the full code point that starts here.
                if (!rbc_.isUnsafe(c) ||
                        (Character.isHighSurrogate(c) && !rbc_.isUnsafe(string_.codePointAt(offset)))) {
                    break;
                }
                // Back up to before this unsafe character.
                --offset;
            } while (offset > 0);
            if (offset < newOffset) {
                // We might have backed up more than necessary.
                // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
                // but for text "chu" setOffset(2) should remain at 2
                // although we initially back up to offset 0.
                // Find the last safe offset no greater than newOffset by iterating forward.
                int lastSafeOffset = offset;
                do {
                    iter_.resetToOffset(lastSafeOffset);
                    // Consume CEs until the text position actually moves;
                    // several CEs may belong to the same position.
                    do {
                        iter_.nextCE();
                    } while ((offset = iter_.getOffset()) == lastSafeOffset);
                    // A boundary beyond newOffset must not be accepted.
                    if (offset <= newOffset) {
                        lastSafeOffset = offset;
                    }
                } while (offset < newOffset);
                newOffset = lastSafeOffset;
            }
        }
        iter_.resetToOffset(newOffset);
        otherHalf_ = 0;
        // 1 = "just after setOffset()": both next() and previous() are allowed.
        dir_ = 1;
    }
501 
502     /**
503      * <p>Set a new source string for iteration, and reset the offset
504      * to the beginning of the text.</p>
505      *
506      * @param source the new source string for iteration.
507      * @stable ICU 2.8
508      */
setText(String source)509     public void setText(String source) {
510         string_ = source; // TODO: do we need to remember the source string in a field?
511         CollationIterator newIter;
512         boolean numeric = rbc_.settings.readOnly().isNumeric();
513         if (rbc_.settings.readOnly().dontCheckFCD()) {
514             newIter = new UTF16CollationIterator(rbc_.data, numeric, string_, 0);
515         } else {
516             newIter = new FCDUTF16CollationIterator(rbc_.data, numeric, string_, 0);
517         }
518         iter_ = newIter;
519         otherHalf_ = 0;
520         dir_ = 0;
521     }
522 
523     /**
524      * <p>Set a new source string iterator for iteration, and reset the
525      * offset to the beginning of the text.
526      * </p>
527      * <p>The source iterator's integrity will be preserved since a new copy
528      * will be created for use.</p>
529      * @param source the new source string iterator for iteration.
530      * @stable ICU 2.8
531      */
setText(UCharacterIterator source)532     public void setText(UCharacterIterator source) {
533         string_ = source.getText(); // TODO: do we need to remember the source string in a field?
534         // Note: In C++, we just setText(source.getText()).
535         // In Java, we actually operate on a character iterator.
536         // (The old code apparently did so only for a CharacterIterator;
537         // for a UCharacterIterator it also just used source.getText()).
538         // TODO: do we need to remember the cloned iterator in a field?
539         UCharacterIterator src;
540         try {
541             src = (UCharacterIterator) source.clone();
542         } catch (CloneNotSupportedException e) {
543             // Fall back to ICU 52 behavior of iterating over the text contents
544             // of the UCharacterIterator.
545             setText(source.getText());
546             return;
547         }
548         src.setToStart();
549         CollationIterator newIter;
550         boolean numeric = rbc_.settings.readOnly().isNumeric();
551         if (rbc_.settings.readOnly().dontCheckFCD()) {
552             newIter = new IterCollationIterator(rbc_.data, numeric, src);
553         } else {
554             newIter = new FCDIterCollationIterator(rbc_.data, numeric, src, 0);
555         }
556         iter_ = newIter;
557         otherHalf_ = 0;
558         dir_ = 0;
559     }
560 
561     /**
562      * <p>Set a new source string iterator for iteration, and reset the
563      * offset to the beginning of the text.
564      * </p>
565      * @param source the new source string iterator for iteration.
566      * @stable ICU 2.8
567      */
setText(CharacterIterator source)568     public void setText(CharacterIterator source) {
569         // Note: In C++, we just setText(source.getText()).
570         // In Java, we actually operate on a character iterator.
571         // TODO: do we need to remember the iterator in a field?
572         // TODO: apparently we don't clone a CharacterIterator in Java,
573         // we only clone the text for a UCharacterIterator?? see the old code in the constructors
574         UCharacterIterator src = new CharacterIteratorWrapper(source);
575         src.setToStart();
576         string_ = src.getText(); // TODO: do we need to remember the source string in a field?
577         CollationIterator newIter;
578         boolean numeric = rbc_.settings.readOnly().isNumeric();
579         if (rbc_.settings.readOnly().dontCheckFCD()) {
580             newIter = new IterCollationIterator(rbc_.data, numeric, src);
581         } else {
582             newIter = new FCDIterCollationIterator(rbc_.data, numeric, src, 0);
583         }
584         iter_ = newIter;
585         otherHalf_ = 0;
586         dir_ = 0;
587     }
588 
589     // Java porting note: This method is @stable ICU 2.0 in ICU4C, but not available
590     // in ICU4J. For now, keep it package local.
591     /**
592     * Gets the comparison order in the desired strength. Ignore the other
593     * differences.
594     * @param order The order value
595     */
strengthOrder(int order)596     int strengthOrder(int order) {
597         int s = rbc_.settings.readOnly().getStrength();
598         // Mask off the unwanted differences.
599         if (s == Collator.PRIMARY) {
600             order &= 0xffff0000;
601         }
602         else if (s == Collator.SECONDARY) {
603             order &= 0xffffff00;
604         }
605 
606         return order;
607     }
608 
609 
610     private static final class MaxExpSink implements ContractionsAndExpansions.CESink {
MaxExpSink(Map<Integer, Integer> h)611         MaxExpSink(Map<Integer, Integer> h) {
612             maxExpansions = h;
613         }
614 
615         // Java 6: @Override
handleCE(long ce)616         public void handleCE(long ce) {
617         }
618 
619         // Java 6: @Override
handleExpansion(long ces[], int start, int length)620         public void handleExpansion(long ces[], int start, int length) {
621             if (length <= 1) {
622                 // We do not need to add single CEs into the map.
623                 return;
624             }
625             int count = 0; // number of CE "halves"
626             for (int i = 0; i < length; ++i) {
627                 count += ceNeedsTwoParts(ces[start + i]) ? 2 : 1;
628             }
629             // last "half" of the last CE
630             long ce = ces[start + length - 1];
631             long p = ce >>> 32;
632             int lower32 = (int) ce;
633             int lastHalf = getSecondHalf(p, lower32);
634             if (lastHalf == 0) {
635                 lastHalf = getFirstHalf(p, lower32);
636                 assert (lastHalf != 0);
637             } else {
638                 lastHalf |= 0xc0; // old-style continuation CE
639             }
640             Integer oldCount = maxExpansions.get(lastHalf);
641             if (oldCount == null || count > oldCount) {
642                 maxExpansions.put(lastHalf, count);
643             }
644         }
645 
646         private Map<Integer, Integer> maxExpansions;
647     }
648 
computeMaxExpansions(CollationData data)649     static final Map<Integer, Integer> computeMaxExpansions(CollationData data) {
650         Map<Integer, Integer> maxExpansions = new HashMap<Integer, Integer>();
651         MaxExpSink sink = new MaxExpSink(maxExpansions);
652         new ContractionsAndExpansions(null, null, sink, true).forData(data);
653         return maxExpansions;
654     }
655 
656     /**
657      * <p> Returns the maximum length of any expansion sequence that ends with
658      * the specified collation element. If there is no expansion with this
659      * collation element as the last element, returns 1.
660      * </p>
661      * @param ce a collation element returned by previous() or next().
662      * @return the maximum length of any expansion sequence ending
663      *         with the specified collation element.
664      * @stable ICU 2.8
665      */
getMaxExpansion(int ce)666     public int getMaxExpansion(int ce) {
667         return getMaxExpansion(rbc_.tailoring.maxExpansions, ce);
668     }
669 
getMaxExpansion(Map<Integer, Integer> maxExpansions, int order)670     static int getMaxExpansion(Map<Integer, Integer> maxExpansions, int order) {
671         if (order == 0) {
672             return 1;
673         }
674         Integer max;
675         if (maxExpansions != null && (max = maxExpansions.get(order)) != null) {
676             return max;
677         }
678         if ((order & 0xc0) == 0xc0) {
679             // old-style continuation CE
680             return 2;
681         } else {
682             return 1;
683         }
684     }
685 
686     /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()). */
normalizeDir()687     private byte normalizeDir() {
688         return dir_ == 1 ? 0 : dir_;
689     }
690 
691     /**
692      * Tests that argument object is equals to this CollationElementIterator.
693      * Iterators are equal if the objects uses the same RuleBasedCollator,
694      * the same source text and have the same current position in iteration.
695      * @param that object to test if it is equals to this
696      *             CollationElementIterator
697      * @stable ICU 2.8
698      */
equals(Object that)699     public boolean equals(Object that) {
700         if (that == this) {
701             return true;
702         }
703         if (that instanceof CollationElementIterator) {
704             CollationElementIterator thatceiter = (CollationElementIterator) that;
705             return rbc_.equals(thatceiter.rbc_)
706                     && otherHalf_ == thatceiter.otherHalf_
707                     && normalizeDir() == thatceiter.normalizeDir()
708                     && string_.equals(thatceiter.string_)
709                     && iter_.equals(thatceiter.iter_);
710         }
711         return false;
712     }
713 
714     /**
715      * Mock implementation of hashCode(). This implementation always returns a constant
716      * value. When Java assertion is enabled, this method triggers an assertion failure.
717      * @internal
718      * @deprecated This API is ICU internal only.
719      */
720     @Deprecated
hashCode()721     public int hashCode() {
722         assert false : "hashCode not designed";
723         return 42;
724     }
725 
726     /**
727      * @internal
728      * @deprecated This API is ICU internal only.
729      */
730     @Deprecated
getRuleBasedCollator()731     public RuleBasedCollator getRuleBasedCollator() {
732         return rbc_;
733     }
734 }
735