1 /* 2 * Copyright (C) 2014 The Android Open Source Project 3 * Copyright (c) 1996, 2020, Oracle and/or its affiliates. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. Oracle designates this 9 * particular file as subject to the "Classpath" exception as provided 10 * by Oracle in the LICENSE file that accompanied this code. 11 * 12 * This code is distributed in the hope that it will be useful, but WITHOUT 13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 * version 2 for more details (a copy is included in the LICENSE file that 16 * accompanied this code). 17 * 18 * You should have received a copy of the GNU General Public License version 19 * 2 along with this work; if not, write to the Free Software Foundation, 20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 21 * 22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 23 * or visit www.oracle.com if you need additional information or have any 24 * questions. 25 */ 26 27 /* 28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 29 * (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved 30 * 31 * The original version of this source code and documentation is copyrighted 32 * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These 33 * materials are provided under terms of a License Agreement between Taligent 34 * and Sun. This technology is protected by multiple US and International 35 * patents. This notice and attribution to Taligent may not be removed. 36 * Taligent is a registered trademark of Taligent, Inc. 37 * 38 */ 39 40 package java.text; 41 42 /** 43 * The {@code CollationElementIterator} class is used as an iterator 44 * to walk through each character of an international string. Use the iterator 45 * to return the ordering priority of the positioned character. The ordering 46 * priority of a character, which we refer to as a key, defines how a character 47 * is collated in the given collation object. 48 * 49 * <p> 50 * For example, consider the following in Spanish: 51 * <blockquote> 52 * <pre> 53 * "ca" → the first key is key('c') and second key is key('a'). 54 * "cha" → the first key is key('ch') and second key is key('a'). 55 * </pre> 56 * </blockquote> 57 * And in German, 58 * <blockquote> 59 * <pre> 60 * "\u00e4b" → the first key is key('a'), the second key is key('e'), and 61 * the third key is key('b'). 62 * </pre> 63 * </blockquote> 64 * The key of a character is an integer composed of primary order(short), 65 * secondary order(byte), and tertiary order(byte). Java strictly defines 66 * the size and signedness of its primitive data types. Therefore, the static 67 * functions {@code primaryOrder}, {@code secondaryOrder}, and 68 * {@code tertiaryOrder} return {@code int}, {@code short}, 69 * and {@code short} respectively to ensure the correctness of the key 70 * value. 71 * 72 * <p> 73 * Example of the iterator usage, 74 * <blockquote> 75 * <pre> 76 * 77 * String testString = "This is a test"; 78 * Collator col = Collator.getInstance(); 79 * if (col instanceof RuleBasedCollator) { 80 * RuleBasedCollator ruleBasedCollator = (RuleBasedCollator)col; 81 * CollationElementIterator collationElementIterator = ruleBasedCollator.getCollationElementIterator(testString); 82 * int primaryOrder = CollationElementIterator.primaryOrder(collationElementIterator.next()); 83 * : 84 * } 85 * </pre> 86 * </blockquote> 87 * 88 * <p> 89 * {@code CollationElementIterator.next} returns the collation order 90 * of the next character. A collation order consists of primary order, 91 * secondary order and tertiary order. The data type of the collation 92 * order is <strong>int</strong>. The first 16 bits of a collation order 93 * is its primary order; the next 8 bits is the secondary order and the 94 * last 8 bits is the tertiary order. 95 * 96 * <p><b>Note:</b> {@code CollationElementIterator} is a part of 97 * {@code RuleBasedCollator} implementation. It is only usable 98 * with {@code RuleBasedCollator} instances. 99 * 100 * @see Collator 101 * @see RuleBasedCollator 102 * @author Helena Shih, Laura Werner, Richard Gillam 103 * @since 1.1 104 */ 105 public final class CollationElementIterator 106 { 107 /** 108 * Null order which indicates the end of string is reached by the 109 * cursor. 110 */ 111 public static final int NULLORDER = 0xffffffff; 112 113 // BEGIN Android-removed: internal constructors. 114 /* 115 * CollationElementIterator constructor. This takes the source string and 116 * the collation object. The cursor will walk thru the source string based 117 * on the predefined collation rules. If the source string is empty, 118 * NULLORDER will be returned on the calls to next(). 119 * @param sourceText the source string. 120 * @param owner the collation object. 121 * 122 CollationElementIterator(String sourceText, RuleBasedCollator owner) { 123 this.owner = owner; 124 ordering = owner.getTables(); 125 if (!sourceText.isEmpty()) { 126 NormalizerBase.Mode mode = 127 CollatorUtilities.toNormalizerMode(owner.getDecomposition()); 128 text = new NormalizerBase(sourceText, mode); 129 } 130 } 131 */ 132 // END Android-removed: internal constructors. 133 134 // Android-added: ICU iterator to delegate to. 135 private android.icu.text.CollationElementIterator icuIterator; 136 137 // Android-added: internal constructor taking an ICU CollationElementIterator. CollationElementIterator(android.icu.text.CollationElementIterator iterator)138 CollationElementIterator(android.icu.text.CollationElementIterator iterator) { 139 icuIterator = iterator; 140 } 141 142 /** 143 * Resets the cursor to the beginning of the string. The next call 144 * to next() will return the first collation element in the string. 145 */ reset()146 public void reset() 147 { 148 // Android-changed: delegate to ICU CollationElementIterator. 149 icuIterator.reset(); 150 } 151 152 /** 153 * Get the next collation element in the string. <p>This iterator iterates 154 * over a sequence of collation elements that were built from the string. 155 * Because there isn't necessarily a one-to-one mapping from characters to 156 * collation elements, this doesn't mean the same thing as "return the 157 * collation element [or ordering priority] of the next character in the 158 * string".</p> 159 * <p>This function returns the collation element that the iterator is currently 160 * pointing to and then updates the internal pointer to point to the next element. 161 * previous() updates the pointer first and then returns the element. This 162 * means that when you change direction while iterating (i.e., call next() and 163 * then call previous(), or call previous() and then call next()), you'll get 164 * back the same element twice.</p> 165 * 166 * @return the next collation element 167 */ next()168 public int next() 169 { 170 // Android-changed: delegate to ICU CollationElementIterator. 171 return icuIterator.next(); 172 } 173 174 /** 175 * Get the previous collation element in the string. <p>This iterator iterates 176 * over a sequence of collation elements that were built from the string. 177 * Because there isn't necessarily a one-to-one mapping from characters to 178 * collation elements, this doesn't mean the same thing as "return the 179 * collation element [or ordering priority] of the previous character in the 180 * string".</p> 181 * <p>This function updates the iterator's internal pointer to point to the 182 * collation element preceding the one it's currently pointing to and then 183 * returns that element, while next() returns the current element and then 184 * updates the pointer. This means that when you change direction while 185 * iterating (i.e., call next() and then call previous(), or call previous() 186 * and then call next()), you'll get back the same element twice.</p> 187 * 188 * @return the previous collation element 189 * @since 1.2 190 */ previous()191 public int previous() 192 { 193 // Android-changed: delegate to ICU CollationElementIterator. 194 return icuIterator.previous(); 195 } 196 197 /** 198 * Return the primary component of a collation element. 199 * @param order the collation element 200 * @return the element's primary component 201 */ primaryOrder(int order)202 public static final int primaryOrder(int order) 203 { 204 // Android-changed: delegate to ICU CollationElementIterator. 205 return android.icu.text.CollationElementIterator.primaryOrder(order); 206 } 207 /** 208 * Return the secondary component of a collation element. 209 * @param order the collation element 210 * @return the element's secondary component 211 */ secondaryOrder(int order)212 public static final short secondaryOrder(int order) 213 { 214 // Android-changed: delegate to ICU CollationElementIterator. 215 return (short) android.icu.text.CollationElementIterator.secondaryOrder(order); 216 } 217 /** 218 * Return the tertiary component of a collation element. 219 * @param order the collation element 220 * @return the element's tertiary component 221 */ tertiaryOrder(int order)222 public static final short tertiaryOrder(int order) 223 { 224 // Android-changed: delegate to ICU CollationElementIterator. 225 return (short) android.icu.text.CollationElementIterator.tertiaryOrder(order); 226 } 227 228 /** 229 * Sets the iterator to point to the collation element corresponding to 230 * the specified character (the parameter is a CHARACTER offset in the 231 * original string, not an offset into its corresponding sequence of 232 * collation elements). The value returned by the next call to next() 233 * will be the collation element corresponding to the specified position 234 * in the text. If that position is in the middle of a contracting 235 * character sequence, the result of the next call to next() is the 236 * collation element for that sequence. This means that getOffset() 237 * is not guaranteed to return the same value as was passed to a preceding 238 * call to setOffset(). 239 * 240 * @param newOffset The new character offset into the original text. 241 * @since 1.2 242 */ 243 @SuppressWarnings("deprecation") // getBeginIndex, getEndIndex and setIndex are deprecated setOffset(int newOffset)244 public void setOffset(int newOffset) 245 { 246 // Android-changed: delegate to ICU CollationElementIterator. 247 icuIterator.setOffset(newOffset); 248 } 249 250 /** 251 * Returns the character offset in the original text corresponding to the next 252 * collation element. (That is, getOffset() returns the position in the text 253 * corresponding to the collation element that will be returned by the next 254 * call to next().) This value will always be the index of the FIRST character 255 * corresponding to the collation element (a contracting character sequence is 256 * when two or more characters all correspond to the same collation element). 257 * This means if you do setOffset(x) followed immediately by getOffset(), getOffset() 258 * won't necessarily return x. 259 * 260 * @return The character offset in the original text corresponding to the collation 261 * element that will be returned by the next call to next(). 262 * @since 1.2 263 */ getOffset()264 public int getOffset() 265 { 266 // Android-changed: delegate to ICU CollationElementIterator. 267 return icuIterator.getOffset(); 268 } 269 270 271 /** 272 * Return the maximum length of any expansion sequences that end 273 * with the specified comparison order. 274 * @param order a collation order returned by previous or next. 275 * @return the maximum length of any expansion sequences ending 276 * with the specified order. 277 * @since 1.2 278 */ getMaxExpansion(int order)279 public int getMaxExpansion(int order) 280 { 281 // Android-changed: delegate to ICU CollationElementIterator. 282 return icuIterator.getMaxExpansion(order); 283 } 284 285 /** 286 * Set a new string over which to iterate. 287 * 288 * @param source the new source text 289 * @since 1.2 290 */ setText(String source)291 public void setText(String source) 292 { 293 // Android-changed: delegate to ICU CollationElementIterator. 294 icuIterator.setText(source); 295 } 296 297 /** 298 * Set a new string over which to iterate. 299 * 300 * @param source the new source text. 301 * @since 1.2 302 */ setText(CharacterIterator source)303 public void setText(CharacterIterator source) 304 { 305 // Android-changed: delegate to ICU CollationElementIterator. 306 icuIterator.setText(source); 307 } 308 309 // BEGIN Android-removed: private helper methods and fields. 310 /* 311 //============================================================ 312 // privates 313 //============================================================ 314 315 /** 316 * Determine if a character is a Thai vowel (which sorts after 317 * its base consonant). 318 * 319 private static final boolean isThaiPreVowel(int ch) { 320 return (ch >= 0x0e40) && (ch <= 0x0e44); 321 } 322 323 /** 324 * Determine if a character is a Thai base consonant 325 * 326 private static final boolean isThaiBaseConsonant(int ch) { 327 return (ch >= 0x0e01) && (ch <= 0x0e2e); 328 } 329 330 /** 331 * Determine if a character is a Lao vowel (which sorts after 332 * its base consonant). 333 * 334 private static final boolean isLaoPreVowel(int ch) { 335 return (ch >= 0x0ec0) && (ch <= 0x0ec4); 336 } 337 338 /** 339 * Determine if a character is a Lao base consonant 340 * 341 private static final boolean isLaoBaseConsonant(int ch) { 342 return (ch >= 0x0e81) && (ch <= 0x0eae); 343 } 344 345 /** 346 * This method produces a buffer which contains the collation 347 * elements for the two characters, with colFirst's values preceding 348 * another character's. Presumably, the other character precedes colFirst 349 * in logical order (otherwise you wouldn't need this method would you?). 350 * The assumption is that the other char's value(s) have already been 351 * computed. If this char has a single element it is passed to this 352 * method as lastValue, and lastExpansion is null. If it has an 353 * expansion it is passed in lastExpansion, and colLastValue is ignored. 354 * 355 private int[] makeReorderedBuffer(int colFirst, 356 int lastValue, 357 int[] lastExpansion, 358 boolean forward) { 359 360 int[] result; 361 362 int firstValue = ordering.getUnicodeOrder(colFirst); 363 if (firstValue >= RuleBasedCollator.CONTRACTCHARINDEX) { 364 firstValue = forward? nextContractChar(colFirst) : prevContractChar(colFirst); 365 } 366 367 int[] firstExpansion = null; 368 if (firstValue >= RuleBasedCollator.EXPANDCHARINDEX) { 369 firstExpansion = ordering.getExpandValueList(firstValue); 370 } 371 372 if (!forward) { 373 int temp1 = firstValue; 374 firstValue = lastValue; 375 lastValue = temp1; 376 int[] temp2 = firstExpansion; 377 firstExpansion = lastExpansion; 378 lastExpansion = temp2; 379 } 380 381 if (firstExpansion == null && lastExpansion == null) { 382 result = new int [2]; 383 result[0] = firstValue; 384 result[1] = lastValue; 385 } 386 else { 387 int firstLength = firstExpansion==null? 1 : firstExpansion.length; 388 int lastLength = lastExpansion==null? 1 : lastExpansion.length; 389 result = new int[firstLength + lastLength]; 390 391 if (firstExpansion == null) { 392 result[0] = firstValue; 393 } 394 else { 395 System.arraycopy(firstExpansion, 0, result, 0, firstLength); 396 } 397 398 if (lastExpansion == null) { 399 result[firstLength] = lastValue; 400 } 401 else { 402 System.arraycopy(lastExpansion, 0, result, firstLength, lastLength); 403 } 404 } 405 406 return result; 407 } 408 409 /** 410 * Check if a comparison order is ignorable. 411 * @return true if a character is ignorable, false otherwise. 412 * 413 static final boolean isIgnorable(int order) 414 { 415 return ((primaryOrder(order) == 0) ? true : false); 416 } 417 418 /** 419 * Get the ordering priority of the next contracting character in the 420 * string. 421 * @param ch the starting character of a contracting character token 422 * @return the next contracting character's ordering. Returns NULLORDER 423 * if the end of string is reached. 424 * 425 private int nextContractChar(int ch) 426 { 427 // First get the ordering of this single character, 428 // which is always the first element in the list 429 Vector<EntryPair> list = ordering.getContractValues(ch); 430 EntryPair pair = list.firstElement(); 431 int order = pair.value; 432 433 // find out the length of the longest contracting character sequence in the list. 434 // There's logic in the builder code to make sure the longest sequence is always 435 // the last. 436 pair = list.lastElement(); 437 int maxLength = pair.entryName.length(); 438 439 // (the Normalizer is cloned here so that the seeking we do in the next loop 440 // won't affect our real position in the text) 441 NormalizerBase tempText = (NormalizerBase)text.clone(); 442 443 // extract the next maxLength characters in the string (we have to do this using the 444 // Normalizer to ensure that our offsets correspond to those the rest of the 445 // iterator is using) and store it in "fragment". 446 tempText.previous(); 447 key.setLength(0); 448 int c = tempText.next(); 449 while (maxLength > 0 && c != NormalizerBase.DONE) { 450 if (Character.isSupplementaryCodePoint(c)) { 451 key.append(Character.toChars(c)); 452 maxLength -= 2; 453 } else { 454 key.append((char)c); 455 --maxLength; 456 } 457 c = tempText.next(); 458 } 459 String fragment = key.toString(); 460 // now that we have that fragment, iterate through this list looking for the 461 // longest sequence that matches the characters in the actual text. (maxLength 462 // is used here to keep track of the length of the longest sequence) 463 // Upon exit from this loop, maxLength will contain the length of the matching 464 // sequence and order will contain the collation-element value corresponding 465 // to this sequence 466 maxLength = 1; 467 for (int i = list.size() - 1; i > 0; i--) { 468 pair = list.elementAt(i); 469 if (!pair.fwd) 470 continue; 471 472 if (fragment.startsWith(pair.entryName) && pair.entryName.length() 473 > maxLength) { 474 maxLength = pair.entryName.length(); 475 order = pair.value; 476 } 477 } 478 479 // seek our current iteration position to the end of the matching sequence 480 // and return the appropriate collation-element value (if there was no matching 481 // sequence, we're already seeked to the right position and order already contains 482 // the correct collation-element value for the single character) 483 while (maxLength > 1) { 484 c = text.next(); 485 maxLength -= Character.charCount(c); 486 } 487 return order; 488 } 489 490 /** 491 * Get the ordering priority of the previous contracting character in the 492 * string. 493 * @param ch the starting character of a contracting character token 494 * @return the next contracting character's ordering. Returns NULLORDER 495 * if the end of string is reached. 496 * 497 private int prevContractChar(int ch) 498 { 499 // This function is identical to nextContractChar(), except that we've 500 // switched things so that the next() and previous() calls on the Normalizer 501 // are switched and so that we skip entry pairs with the fwd flag turned on 502 // rather than off. Notice that we still use append() and startsWith() when 503 // working on the fragment. This is because the entry pairs that are used 504 // in reverse iteration have their names reversed already. 505 Vector<EntryPair> list = ordering.getContractValues(ch); 506 EntryPair pair = list.firstElement(); 507 int order = pair.value; 508 509 pair = list.lastElement(); 510 int maxLength = pair.entryName.length(); 511 512 NormalizerBase tempText = (NormalizerBase)text.clone(); 513 514 tempText.next(); 515 key.setLength(0); 516 int c = tempText.previous(); 517 while (maxLength > 0 && c != NormalizerBase.DONE) { 518 if (Character.isSupplementaryCodePoint(c)) { 519 key.append(Character.toChars(c)); 520 maxLength -= 2; 521 } else { 522 key.append((char)c); 523 --maxLength; 524 } 525 c = tempText.previous(); 526 } 527 String fragment = key.toString(); 528 529 maxLength = 1; 530 for (int i = list.size() - 1; i > 0; i--) { 531 pair = list.elementAt(i); 532 if (pair.fwd) 533 continue; 534 535 if (fragment.startsWith(pair.entryName) && pair.entryName.length() 536 > maxLength) { 537 maxLength = pair.entryName.length(); 538 order = pair.value; 539 } 540 } 541 542 while (maxLength > 1) { 543 c = text.previous(); 544 maxLength -= Character.charCount(c); 545 } 546 return order; 547 } 548 549 static final int UNMAPPEDCHARVALUE = 0x7FFF0000; 550 551 private NormalizerBase text = null; 552 private int[] buffer = null; 553 private int expIndex = 0; 554 private StringBuffer key = new StringBuffer(5); 555 private int swapOrder = 0; 556 private RBCollationTables ordering; 557 private RuleBasedCollator owner; 558 */ 559 // END Android-removed: private helper methods and fields. 560 } 561