1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  * Copyright (c) 1996, 2020, Oracle and/or its affiliates. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.  Oracle designates this
9  * particular file as subject to the "Classpath" exception as provided
10  * by Oracle in the LICENSE file that accompanied this code.
11  *
12  * This code is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15  * version 2 for more details (a copy is included in the LICENSE file that
16  * accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License version
19  * 2 along with this work; if not, write to the Free Software Foundation,
20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23  * or visit www.oracle.com if you need additional information or have any
24  * questions.
25  */
26 
27 /*
28  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29  * (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved
30  *
31  *   The original version of this source code and documentation is copyrighted
32  * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
33  * materials are provided under terms of a License Agreement between Taligent
34  * and Sun. This technology is protected by multiple US and International
35  * patents. This notice and attribution to Taligent may not be removed.
36  *   Taligent is a registered trademark of Taligent, Inc.
37  *
38  */
39 
40 package java.text;
41 
42 /**
43  * The {@code CollationElementIterator} class is used as an iterator
44  * to walk through each character of an international string. Use the iterator
45  * to return the ordering priority of the positioned character. The ordering
46  * priority of a character, which we refer to as a key, defines how a character
47  * is collated in the given collation object.
48  *
49  * <p>
50  * For example, consider the following in Spanish:
51  * <blockquote>
52  * <pre>
53  * "ca" &rarr; the first key is key('c') and second key is key('a').
54  * "cha" &rarr; the first key is key('ch') and second key is key('a').
55  * </pre>
56  * </blockquote>
57  * And in German,
58  * <blockquote>
59  * <pre>
60  * "\u00e4b" &rarr; the first key is key('a'), the second key is key('e'), and
61  * the third key is key('b').
62  * </pre>
63  * </blockquote>
64  * The key of a character is an integer composed of primary order(short),
65  * secondary order(byte), and tertiary order(byte). Java strictly defines
66  * the size and signedness of its primitive data types. Therefore, the static
67  * functions {@code primaryOrder}, {@code secondaryOrder}, and
68  * {@code tertiaryOrder} return {@code int}, {@code short},
69  * and {@code short} respectively to ensure the correctness of the key
70  * value.
71  *
72  * <p>
73  * Example of the iterator usage,
74  * <blockquote>
75  * <pre>
76  *
77  *  String testString = "This is a test";
78  *  Collator col = Collator.getInstance();
79  *  if (col instanceof RuleBasedCollator) {
80  *      RuleBasedCollator ruleBasedCollator = (RuleBasedCollator)col;
81  *      CollationElementIterator collationElementIterator = ruleBasedCollator.getCollationElementIterator(testString);
82  *      int primaryOrder = CollationElementIterator.primaryOrder(collationElementIterator.next());
83  *          :
84  *  }
85  * </pre>
86  * </blockquote>
87  *
88  * <p>
89  * {@code CollationElementIterator.next} returns the collation order
90  * of the next character. A collation order consists of primary order,
91  * secondary order and tertiary order. The data type of the collation
 * order is <strong>int</strong>. The first 16 bits of a collation order
 * are its primary order; the next 8 bits are the secondary order and the
 * last 8 bits are the tertiary order.
95  *
96  * <p><b>Note:</b> {@code CollationElementIterator} is a part of
97  * {@code RuleBasedCollator} implementation. It is only usable
98  * with {@code RuleBasedCollator} instances.
99  *
100  * @see                Collator
101  * @see                RuleBasedCollator
102  * @author             Helena Shih, Laura Werner, Richard Gillam
103  * @since 1.1
104  */
105 public final class CollationElementIterator
106 {
    /**
     * Null order: the sentinel collation element which indicates that the
     * cursor has reached the end of the string.
     */
    public static final int NULLORDER = 0xffffffff;
112 
113     // BEGIN Android-removed: internal constructors.
114     /*
115      * CollationElementIterator constructor.  This takes the source string and
116      * the collation object.  The cursor will walk thru the source string based
117      * on the predefined collation rules.  If the source string is empty,
118      * NULLORDER will be returned on the calls to next().
119      * @param sourceText the source string.
120      * @param owner the collation object.
121      *
122     CollationElementIterator(String sourceText, RuleBasedCollator owner) {
123         this.owner = owner;
124         ordering = owner.getTables();
125         if (!sourceText.isEmpty()) {
126             NormalizerBase.Mode mode =
127                 CollatorUtilities.toNormalizerMode(owner.getDecomposition());
128             text = new NormalizerBase(sourceText, mode);
129         }
130     }
131     */
132     // END Android-removed: internal constructors.
133 
134     // Android-added: ICU iterator to delegate to.
135     private android.icu.text.CollationElementIterator icuIterator;
136 
137    // Android-added: internal constructor taking an ICU CollationElementIterator.
CollationElementIterator(android.icu.text.CollationElementIterator iterator)138     CollationElementIterator(android.icu.text.CollationElementIterator iterator) {
139         icuIterator = iterator;
140     }
141 
142     /**
143      * Resets the cursor to the beginning of the string.  The next call
144      * to next() will return the first collation element in the string.
145      */
reset()146     public void reset()
147     {
148         // Android-changed: delegate to ICU CollationElementIterator.
149         icuIterator.reset();
150     }
151 
152     /**
153      * Get the next collation element in the string.  <p>This iterator iterates
154      * over a sequence of collation elements that were built from the string.
155      * Because there isn't necessarily a one-to-one mapping from characters to
156      * collation elements, this doesn't mean the same thing as "return the
157      * collation element [or ordering priority] of the next character in the
158      * string".</p>
159      * <p>This function returns the collation element that the iterator is currently
160      * pointing to and then updates the internal pointer to point to the next element.
161      * previous() updates the pointer first and then returns the element.  This
162      * means that when you change direction while iterating (i.e., call next() and
163      * then call previous(), or call previous() and then call next()), you'll get
164      * back the same element twice.</p>
165      *
166      * @return the next collation element
167      */
next()168     public int next()
169     {
170         // Android-changed: delegate to ICU CollationElementIterator.
171         return icuIterator.next();
172     }
173 
174     /**
175      * Get the previous collation element in the string.  <p>This iterator iterates
176      * over a sequence of collation elements that were built from the string.
177      * Because there isn't necessarily a one-to-one mapping from characters to
178      * collation elements, this doesn't mean the same thing as "return the
179      * collation element [or ordering priority] of the previous character in the
180      * string".</p>
181      * <p>This function updates the iterator's internal pointer to point to the
182      * collation element preceding the one it's currently pointing to and then
183      * returns that element, while next() returns the current element and then
184      * updates the pointer.  This means that when you change direction while
185      * iterating (i.e., call next() and then call previous(), or call previous()
186      * and then call next()), you'll get back the same element twice.</p>
187      *
188      * @return the previous collation element
189      * @since 1.2
190      */
previous()191     public int previous()
192     {
193         // Android-changed: delegate to ICU CollationElementIterator.
194         return icuIterator.previous();
195     }
196 
197     /**
198      * Return the primary component of a collation element.
199      * @param order the collation element
200      * @return the element's primary component
201      */
primaryOrder(int order)202     public static final int primaryOrder(int order)
203     {
204         // Android-changed: delegate to ICU CollationElementIterator.
205         return android.icu.text.CollationElementIterator.primaryOrder(order);
206     }
207     /**
208      * Return the secondary component of a collation element.
209      * @param order the collation element
210      * @return the element's secondary component
211      */
secondaryOrder(int order)212     public static final short secondaryOrder(int order)
213     {
214         // Android-changed: delegate to ICU CollationElementIterator.
215        return (short) android.icu.text.CollationElementIterator.secondaryOrder(order);
216     }
217     /**
218      * Return the tertiary component of a collation element.
219      * @param order the collation element
220      * @return the element's tertiary component
221      */
tertiaryOrder(int order)222     public static final short tertiaryOrder(int order)
223     {
224         // Android-changed: delegate to ICU CollationElementIterator.
225         return (short) android.icu.text.CollationElementIterator.tertiaryOrder(order);
226     }
227 
228     /**
229      * Sets the iterator to point to the collation element corresponding to
230      * the specified character (the parameter is a CHARACTER offset in the
231      * original string, not an offset into its corresponding sequence of
232      * collation elements).  The value returned by the next call to next()
233      * will be the collation element corresponding to the specified position
234      * in the text.  If that position is in the middle of a contracting
235      * character sequence, the result of the next call to next() is the
236      * collation element for that sequence.  This means that getOffset()
237      * is not guaranteed to return the same value as was passed to a preceding
238      * call to setOffset().
239      *
240      * @param newOffset The new character offset into the original text.
241      * @since 1.2
242      */
243     @SuppressWarnings("deprecation") // getBeginIndex, getEndIndex and setIndex are deprecated
setOffset(int newOffset)244     public void setOffset(int newOffset)
245     {
246         // Android-changed: delegate to ICU CollationElementIterator.
247         icuIterator.setOffset(newOffset);
248     }
249 
250     /**
251      * Returns the character offset in the original text corresponding to the next
252      * collation element.  (That is, getOffset() returns the position in the text
253      * corresponding to the collation element that will be returned by the next
254      * call to next().)  This value will always be the index of the FIRST character
255      * corresponding to the collation element (a contracting character sequence is
256      * when two or more characters all correspond to the same collation element).
257      * This means if you do setOffset(x) followed immediately by getOffset(), getOffset()
258      * won't necessarily return x.
259      *
260      * @return The character offset in the original text corresponding to the collation
261      * element that will be returned by the next call to next().
262      * @since 1.2
263      */
getOffset()264     public int getOffset()
265     {
266         // Android-changed: delegate to ICU CollationElementIterator.
267         return icuIterator.getOffset();
268     }
269 
270 
271     /**
272      * Return the maximum length of any expansion sequences that end
273      * with the specified comparison order.
274      * @param order a collation order returned by previous or next.
275      * @return the maximum length of any expansion sequences ending
276      *         with the specified order.
277      * @since 1.2
278      */
getMaxExpansion(int order)279     public int getMaxExpansion(int order)
280     {
281         // Android-changed: delegate to ICU CollationElementIterator.
282         return icuIterator.getMaxExpansion(order);
283     }
284 
285     /**
286      * Set a new string over which to iterate.
287      *
288      * @param source  the new source text
289      * @since 1.2
290      */
setText(String source)291     public void setText(String source)
292     {
293         // Android-changed: delegate to ICU CollationElementIterator.
294         icuIterator.setText(source);
295     }
296 
297     /**
298      * Set a new string over which to iterate.
299      *
300      * @param source  the new source text.
301      * @since 1.2
302      */
setText(CharacterIterator source)303     public void setText(CharacterIterator source)
304     {
305         // Android-changed: delegate to ICU CollationElementIterator.
306         icuIterator.setText(source);
307     }
308 
309     // BEGIN Android-removed: private helper methods and fields.
310     /*
311     //============================================================
312     // privates
313     //============================================================
314 
315     /**
316      * Determine if a character is a Thai vowel (which sorts after
317      * its base consonant).
318      *
319     private static final boolean isThaiPreVowel(int ch) {
320         return (ch >= 0x0e40) && (ch <= 0x0e44);
321     }
322 
323     /**
324      * Determine if a character is a Thai base consonant
325      *
326     private static final boolean isThaiBaseConsonant(int ch) {
327         return (ch >= 0x0e01) && (ch <= 0x0e2e);
328     }
329 
330     /**
331      * Determine if a character is a Lao vowel (which sorts after
332      * its base consonant).
333      *
334     private static final boolean isLaoPreVowel(int ch) {
335         return (ch >= 0x0ec0) && (ch <= 0x0ec4);
336     }
337 
338     /**
339      * Determine if a character is a Lao base consonant
340      *
341     private static final boolean isLaoBaseConsonant(int ch) {
342         return (ch >= 0x0e81) && (ch <= 0x0eae);
343     }
344 
345     /**
346      * This method produces a buffer which contains the collation
347      * elements for the two characters, with colFirst's values preceding
348      * another character's.  Presumably, the other character precedes colFirst
349      * in logical order (otherwise you wouldn't need this method would you?).
350      * The assumption is that the other char's value(s) have already been
351      * computed.  If this char has a single element it is passed to this
352      * method as lastValue, and lastExpansion is null.  If it has an
     * expansion it is passed in lastExpansion, and lastValue is ignored.
354      *
355     private int[] makeReorderedBuffer(int colFirst,
356                                       int lastValue,
357                                       int[] lastExpansion,
358                                       boolean forward) {
359 
360         int[] result;
361 
362         int firstValue = ordering.getUnicodeOrder(colFirst);
363         if (firstValue >= RuleBasedCollator.CONTRACTCHARINDEX) {
364             firstValue = forward? nextContractChar(colFirst) : prevContractChar(colFirst);
365         }
366 
367         int[] firstExpansion = null;
368         if (firstValue >= RuleBasedCollator.EXPANDCHARINDEX) {
369             firstExpansion = ordering.getExpandValueList(firstValue);
370         }
371 
372         if (!forward) {
373             int temp1 = firstValue;
374             firstValue = lastValue;
375             lastValue = temp1;
376             int[] temp2 = firstExpansion;
377             firstExpansion = lastExpansion;
378             lastExpansion = temp2;
379         }
380 
381         if (firstExpansion == null && lastExpansion == null) {
382             result = new int [2];
383             result[0] = firstValue;
384             result[1] = lastValue;
385         }
386         else {
387             int firstLength = firstExpansion==null? 1 : firstExpansion.length;
388             int lastLength = lastExpansion==null? 1 : lastExpansion.length;
389             result = new int[firstLength + lastLength];
390 
391             if (firstExpansion == null) {
392                 result[0] = firstValue;
393             }
394             else {
395                 System.arraycopy(firstExpansion, 0, result, 0, firstLength);
396             }
397 
398             if (lastExpansion == null) {
399                 result[firstLength] = lastValue;
400             }
401             else {
402                 System.arraycopy(lastExpansion, 0, result, firstLength, lastLength);
403             }
404         }
405 
406         return result;
407     }
408 
409     /**
410      *  Check if a comparison order is ignorable.
411      *  @return true if a character is ignorable, false otherwise.
412      *
413     static final boolean isIgnorable(int order)
414     {
415         return ((primaryOrder(order) == 0) ? true : false);
416     }
417 
418     /**
419      * Get the ordering priority of the next contracting character in the
420      * string.
421      * @param ch the starting character of a contracting character token
422      * @return the next contracting character's ordering.  Returns NULLORDER
423      * if the end of string is reached.
424      *
425     private int nextContractChar(int ch)
426     {
427         // First get the ordering of this single character,
428         // which is always the first element in the list
429         Vector<EntryPair> list = ordering.getContractValues(ch);
430         EntryPair pair = list.firstElement();
431         int order = pair.value;
432 
433         // find out the length of the longest contracting character sequence in the list.
434         // There's logic in the builder code to make sure the longest sequence is always
435         // the last.
436         pair = list.lastElement();
437         int maxLength = pair.entryName.length();
438 
439         // (the Normalizer is cloned here so that the seeking we do in the next loop
440         // won't affect our real position in the text)
441         NormalizerBase tempText = (NormalizerBase)text.clone();
442 
443         // extract the next maxLength characters in the string (we have to do this using the
444         // Normalizer to ensure that our offsets correspond to those the rest of the
445         // iterator is using) and store it in "fragment".
446         tempText.previous();
447         key.setLength(0);
448         int c = tempText.next();
449         while (maxLength > 0 && c != NormalizerBase.DONE) {
450             if (Character.isSupplementaryCodePoint(c)) {
451                 key.append(Character.toChars(c));
452                 maxLength -= 2;
453             } else {
454                 key.append((char)c);
455                 --maxLength;
456             }
457             c = tempText.next();
458         }
459         String fragment = key.toString();
460         // now that we have that fragment, iterate through this list looking for the
461         // longest sequence that matches the characters in the actual text.  (maxLength
462         // is used here to keep track of the length of the longest sequence)
463         // Upon exit from this loop, maxLength will contain the length of the matching
464         // sequence and order will contain the collation-element value corresponding
465         // to this sequence
466         maxLength = 1;
467         for (int i = list.size() - 1; i > 0; i--) {
468             pair = list.elementAt(i);
469             if (!pair.fwd)
470                 continue;
471 
472             if (fragment.startsWith(pair.entryName) && pair.entryName.length()
473                     > maxLength) {
474                 maxLength = pair.entryName.length();
475                 order = pair.value;
476             }
477         }
478 
479         // seek our current iteration position to the end of the matching sequence
480         // and return the appropriate collation-element value (if there was no matching
481         // sequence, we're already seeked to the right position and order already contains
482         // the correct collation-element value for the single character)
483         while (maxLength > 1) {
484             c = text.next();
485             maxLength -= Character.charCount(c);
486         }
487         return order;
488     }
489 
490     /**
491      * Get the ordering priority of the previous contracting character in the
492      * string.
493      * @param ch the starting character of a contracting character token
     * @return the previous contracting character's ordering.  Returns NULLORDER
495      * if the end of string is reached.
496      *
497     private int prevContractChar(int ch)
498     {
499         // This function is identical to nextContractChar(), except that we've
500         // switched things so that the next() and previous() calls on the Normalizer
501         // are switched and so that we skip entry pairs with the fwd flag turned on
502         // rather than off.  Notice that we still use append() and startsWith() when
503         // working on the fragment.  This is because the entry pairs that are used
504         // in reverse iteration have their names reversed already.
505         Vector<EntryPair> list = ordering.getContractValues(ch);
506         EntryPair pair = list.firstElement();
507         int order = pair.value;
508 
509         pair = list.lastElement();
510         int maxLength = pair.entryName.length();
511 
512         NormalizerBase tempText = (NormalizerBase)text.clone();
513 
514         tempText.next();
515         key.setLength(0);
516         int c = tempText.previous();
517         while (maxLength > 0 && c != NormalizerBase.DONE) {
518             if (Character.isSupplementaryCodePoint(c)) {
519                 key.append(Character.toChars(c));
520                 maxLength -= 2;
521             } else {
522                 key.append((char)c);
523                 --maxLength;
524             }
525             c = tempText.previous();
526         }
527         String fragment = key.toString();
528 
529         maxLength = 1;
530         for (int i = list.size() - 1; i > 0; i--) {
531             pair = list.elementAt(i);
532             if (pair.fwd)
533                 continue;
534 
535             if (fragment.startsWith(pair.entryName) && pair.entryName.length()
536                     > maxLength) {
537                 maxLength = pair.entryName.length();
538                 order = pair.value;
539             }
540         }
541 
542         while (maxLength > 1) {
543             c = text.previous();
544             maxLength -= Character.charCount(c);
545         }
546         return order;
547     }
548 
549     static final int UNMAPPEDCHARVALUE = 0x7FFF0000;
550 
551     private NormalizerBase text = null;
552     private int[] buffer = null;
553     private int expIndex = 0;
554     private StringBuffer key = new StringBuffer(5);
555     private int swapOrder = 0;
556     private RBCollationTables ordering;
557     private RuleBasedCollator owner;
558     */
559     // END Android-removed: private helper methods and fields.
560 }
561