1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2003-2015, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11 package android.icu.text; 12 13 import java.io.IOException; 14 import java.io.InputStream; 15 import java.lang.ref.WeakReference; 16 import java.nio.ByteBuffer; 17 18 import android.icu.impl.CharTrie; 19 import android.icu.impl.ICUBinary; 20 import android.icu.impl.StringPrepDataReader; 21 import android.icu.impl.UBiDiProps; 22 import android.icu.lang.UCharacter; 23 import android.icu.lang.UCharacterDirection; 24 import android.icu.util.ICUUncheckedIOException; 25 import android.icu.util.VersionInfo; 26 27 /** 28 * StringPrep API implements the StingPrep framework as described by 29 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>. 30 * StringPrep prepares Unicode strings for use in network protocols. 31 * Profiles of StingPrep are set of rules and data according to which the 32 * Unicode Strings are prepared. Each profiles contains tables which describe 33 * how a code point should be treated. The tables are broadly classied into 34 * <ul> 35 * <li> Unassigned Table: Contains code points that are unassigned 36 * in the Unicode Version supported by StringPrep. Currently 37 * RFC 3454 supports Unicode 3.2. </li> 38 * <li> Prohibited Table: Contains code points that are prohibted from 39 * the output of the StringPrep processing function. </li> 40 * <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li> 41 * </ul> 42 * 43 * The procedure for preparing Unicode strings: 44 * <ol> 45 * <li> Map: For each character in the input, check if it has a mapping 46 * and, if so, replace it with its mapping. </li> 47 * <li> Normalize: Possibly normalize the result of step 1 using Unicode 48 * normalization. </li> 49 * <li> Prohibit: Check for any characters that are not allowed in the 50 * output. If any are found, return an error.</li> 51 * <li> Check bidi: Possibly check for right-to-left characters, and if 52 * any are found, make sure that the whole string satisfies the 53 * requirements for bidirectional strings. If the string does not 54 * satisfy the requirements for bidirectional strings, return an 55 * error. </li> 56 * </ol> 57 * @author Ram Viswanadha 58 * @hide Only a subset of ICU is exposed in Android 59 */ 60 @libcore.api.CorePlatformApi 61 public final class StringPrep { 62 /** 63 * Option to prohibit processing of unassigned code points in the input 64 * 65 * @see #prepare 66 */ 67 @libcore.api.CorePlatformApi 68 public static final int DEFAULT = 0x0000; 69 70 /** 71 * Option to allow processing of unassigned code points in the input 72 * 73 * @see #prepare 74 */ 75 public static final int ALLOW_UNASSIGNED = 0x0001; 76 77 /** 78 * Profile type: RFC3491 Nameprep 79 * @see #getInstance(int) 80 */ 81 public static final int RFC3491_NAMEPREP = 0; 82 83 /** 84 * Profile type: RFC3530 nfs4_cs_prep 85 * @see #getInstance(int) 86 */ 87 public static final int RFC3530_NFS4_CS_PREP = 1; 88 89 /** 90 * Profile type: RFC3530 nfs4_cs_prep with case insensitive option 91 * @see #getInstance(int) 92 */ 93 public static final int RFC3530_NFS4_CS_PREP_CI = 2; 94 95 /** 96 * Profile type: RFC3530 nfs4_cis_prep 97 * @see #getInstance(int) 98 */ 99 public static final int RFC3530_NFS4_CIS_PREP = 3; 100 101 /** 102 * Profile type: RFC3530 nfs4_mixed_prep for prefix 103 * @see #getInstance(int) 104 */ 105 public static final int RFC3530_NFS4_MIXED_PREP_PREFIX = 4; 106 107 /** 108 * Profile type: RFC3530 nfs4_mixed_prep for suffix 109 * @see #getInstance(int) 110 */ 111 public static final int RFC3530_NFS4_MIXED_PREP_SUFFIX = 5; 112 113 /** 114 * Profile type: RFC3722 iSCSI 115 * @see #getInstance(int) 116 */ 117 public static final int RFC3722_ISCSI = 6; 118 119 /** 120 * Profile type: RFC3920 XMPP Nodeprep 121 * @see #getInstance(int) 122 */ 123 public static final int RFC3920_NODEPREP = 7; 124 125 /** 126 * Profile type: RFC3920 XMPP Resourceprep 127 * @see #getInstance(int) 128 */ 129 @libcore.api.CorePlatformApi 130 public static final int RFC3920_RESOURCEPREP = 8; 131 132 /** 133 * Profile type: RFC4011 Policy MIB Stringprep 134 * @see #getInstance(int) 135 */ 136 public static final int RFC4011_MIB = 9; 137 138 /** 139 * Profile type: RFC4013 SASLprep 140 * @see #getInstance(int) 141 */ 142 public static final int RFC4013_SASLPREP = 10; 143 144 /** 145 * Profile type: RFC4505 trace 146 * @see #getInstance(int) 147 */ 148 public static final int RFC4505_TRACE = 11; 149 150 /** 151 * Profile type: RFC4518 LDAP 152 * @see #getInstance(int) 153 */ 154 public static final int RFC4518_LDAP = 12; 155 156 /** 157 * Profile type: RFC4518 LDAP for case ignore, numeric and stored prefix 158 * matching rules 159 * @see #getInstance(int) 160 */ 161 public static final int RFC4518_LDAP_CI = 13; 162 163 // Last available profile 164 private static final int MAX_PROFILE = RFC4518_LDAP_CI; 165 166 // Profile names must be aligned to profile type definitions 167 private static final String[] PROFILE_NAMES = { 168 "rfc3491", /* RFC3491_NAMEPREP */ 169 "rfc3530cs", /* RFC3530_NFS4_CS_PREP */ 170 "rfc3530csci", /* RFC3530_NFS4_CS_PREP_CI */ 171 "rfc3491", /* RFC3530_NSF4_CIS_PREP */ 172 "rfc3530mixp", /* RFC3530_NSF4_MIXED_PREP_PREFIX */ 173 "rfc3491", /* RFC3530_NSF4_MIXED_PREP_SUFFIX */ 174 "rfc3722", /* RFC3722_ISCSI */ 175 "rfc3920node", /* RFC3920_NODEPREP */ 176 "rfc3920res", /* RFC3920_RESOURCEPREP */ 177 "rfc4011", /* RFC4011_MIB */ 178 "rfc4013", /* RFC4013_SASLPREP */ 179 "rfc4505", /* RFC4505_TRACE */ 180 "rfc4518", /* RFC4518_LDAP */ 181 "rfc4518ci", /* RFC4518_LDAP_CI */ 182 }; 183 184 @SuppressWarnings({"unchecked", "rawtypes"}) 185 private static final WeakReference<StringPrep>[] CACHE = (WeakReference<StringPrep>[])new WeakReference[MAX_PROFILE+1]; 186 187 private static final int UNASSIGNED = 0x0000; 188 private static final int MAP = 0x0001; 189 private static final int PROHIBITED = 0x0002; 190 private static final int DELETE = 0x0003; 191 private static final int TYPE_LIMIT = 0x0004; 192 193 private static final int NORMALIZATION_ON = 0x0001; 194 private static final int CHECK_BIDI_ON = 0x0002; 195 196 private static final int TYPE_THRESHOLD = 0xFFF0; 197 private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/ 198 //private static final int MAX_INDEX_TOP_LENGTH = 0x0003; 199 200 /* indexes[] value names */ 201 // private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */ 202 private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */ 203 private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */ 204 private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */ 205 private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */ 206 private static final int THREE_UCHARS_MAPPING_INDEX_START = 5; 207 private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6; 208 private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */ 209 private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */ 210 211 212 // CharTrie implmentation for reading the trie data 213 private CharTrie sprepTrie; 214 // Indexes read from the data file 215 private int[] indexes; 216 // mapping data read from the data file 217 private char[] mappingData; 218 // the version of Unicode supported by the data file 219 private VersionInfo sprepUniVer; 220 // the Unicode version of last entry in the 221 // NormalizationCorrections.txt file if normalization 222 // is turned on 223 private VersionInfo normCorrVer; 224 // Option to turn on Normalization 225 private boolean doNFKC; 226 // Option to turn on checking for BiDi rules 227 private boolean checkBiDi; 228 // bidi properties 229 private UBiDiProps bdp; 230 getCodePointValue(int ch)231 private char getCodePointValue(int ch){ 232 return sprepTrie.getCodePointValue(ch); 233 } 234 getVersionInfo(int comp)235 private static VersionInfo getVersionInfo(int comp){ 236 int micro = comp & 0xFF; 237 int milli =(comp >> 8) & 0xFF; 238 int minor =(comp >> 16) & 0xFF; 239 int major =(comp >> 24) & 0xFF; 240 return VersionInfo.getInstance(major,minor,milli,micro); 241 } 242 getVersionInfo(byte[] version)243 private static VersionInfo getVersionInfo(byte[] version){ 244 if(version.length != 4){ 245 return null; 246 } 247 return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]); 248 } 249 250 /** 251 * Creates an StringPrep object after reading the input stream. 252 * The object does not hold a reference to the input steam, so the stream can be 253 * closed after the method returns. 254 * 255 * @param inputStream The stream for reading the StringPrep profile binarySun 256 * @throws IOException An exception occurs when I/O of the inputstream is invalid 257 */ StringPrep(InputStream inputStream)258 public StringPrep(InputStream inputStream) throws IOException{ 259 // TODO: Add a public constructor that takes ByteBuffer directly. 260 this(ICUBinary.getByteBufferFromInputStreamAndCloseStream(inputStream)); 261 } 262 StringPrep(ByteBuffer bytes)263 private StringPrep(ByteBuffer bytes) throws IOException { 264 StringPrepDataReader reader = new StringPrepDataReader(bytes); 265 266 // read the indexes 267 indexes = reader.readIndexes(INDEX_TOP); 268 269 sprepTrie = new CharTrie(bytes, null); 270 271 //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes 272 // load the rest of the data data and initialize the data members 273 mappingData = reader.read(indexes[INDEX_MAPPING_DATA_SIZE]/2); 274 275 // get the options 276 doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0); 277 checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0); 278 sprepUniVer = getVersionInfo(reader.getUnicodeVersion()); 279 normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]); 280 VersionInfo normUniVer = UCharacter.getUnicodeVersion(); 281 if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */ 282 normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */ 283 ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/ 284 ){ 285 throw new IOException("Normalization Correction version not supported"); 286 } 287 288 if(checkBiDi) { 289 bdp=UBiDiProps.INSTANCE; 290 } 291 } 292 293 /** 294 * Gets a StringPrep instance for the specified profile 295 * 296 * @param profile The profile passed to find the StringPrep instance. 297 */ 298 @libcore.api.CorePlatformApi getInstance(int profile)299 public static StringPrep getInstance(int profile) { 300 if (profile < 0 || profile > MAX_PROFILE) { 301 throw new IllegalArgumentException("Bad profile type"); 302 } 303 304 StringPrep instance = null; 305 306 // A StringPrep instance is immutable. We use a single instance 307 // per type and store it in the internal cache. 308 synchronized (CACHE) { 309 WeakReference<StringPrep> ref = CACHE[profile]; 310 if (ref != null) { 311 instance = ref.get(); 312 } 313 314 if (instance == null) { 315 ByteBuffer bytes = ICUBinary.getRequiredData(PROFILE_NAMES[profile] + ".spp"); 316 if (bytes != null) { 317 try { 318 instance = new StringPrep(bytes); 319 } catch (IOException e) { 320 throw new ICUUncheckedIOException(e); 321 } 322 } 323 if (instance != null) { 324 CACHE[profile] = new WeakReference<StringPrep>(instance); 325 } 326 } 327 } 328 return instance; 329 } 330 331 private static final class Values{ 332 boolean isIndex; 333 int value; 334 int type; reset()335 public void reset(){ 336 isIndex = false; 337 value = 0; 338 type = -1; 339 } 340 } 341 getValues(char trieWord,Values values)342 private static final void getValues(char trieWord,Values values){ 343 values.reset(); 344 if(trieWord == 0){ 345 /* 346 * Initial value stored in the mapping table 347 * just return TYPE_LIMIT .. so that 348 * the source codepoint is copied to the destination 349 */ 350 values.type = TYPE_LIMIT; 351 }else if(trieWord >= TYPE_THRESHOLD){ 352 values.type = (trieWord - TYPE_THRESHOLD); 353 }else{ 354 /* get the type */ 355 values.type = MAP; 356 /* ascertain if the value is index or delta */ 357 if((trieWord & 0x02)>0){ 358 values.isIndex = true; 359 values.value = trieWord >> 2; //mask off the lower 2 bits and shift 360 361 }else{ 362 values.isIndex = false; 363 values.value = (trieWord<<16)>>16; 364 values.value = (values.value >> 2); 365 366 } 367 368 if((trieWord>>2) == MAX_INDEX_VALUE){ 369 values.type = DELETE; 370 values.isIndex = false; 371 values.value = 0; 372 } 373 } 374 } 375 376 377 map( UCharacterIterator iter, int options)378 private StringBuffer map( UCharacterIterator iter, int options) 379 throws StringPrepParseException{ 380 381 Values val = new Values(); 382 char result = 0; 383 int ch = UCharacterIterator.DONE; 384 StringBuffer dest = new StringBuffer(); 385 boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0); 386 387 while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ 388 389 result = getCodePointValue(ch); 390 getValues(result,val); 391 392 // check if the source codepoint is unassigned 393 if(val.type == UNASSIGNED && allowUnassigned == false){ 394 throw new StringPrepParseException("An unassigned code point was found in the input", 395 StringPrepParseException.UNASSIGNED_ERROR, 396 iter.getText(),iter.getIndex()); 397 }else if((val.type == MAP)){ 398 int index, length; 399 400 if(val.isIndex){ 401 index = val.value; 402 if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] && 403 index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){ 404 length = 1; 405 }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] && 406 index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){ 407 length = 2; 408 }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] && 409 index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){ 410 length = 3; 411 }else{ 412 length = mappingData[index++]; 413 } 414 /* copy mapping to destination */ 415 dest.append(mappingData,index,length); 416 continue; 417 418 }else{ 419 ch -= val.value; 420 } 421 }else if(val.type == DELETE){ 422 // just consume the codepoint and contine 423 continue; 424 } 425 //copy the source into destination 426 UTF16.append(dest,ch); 427 } 428 429 return dest; 430 } 431 432 normalize(StringBuffer src)433 private StringBuffer normalize(StringBuffer src){ 434 return new StringBuffer( 435 Normalizer.normalize( 436 src.toString(), 437 Normalizer.NFKC, 438 Normalizer.UNICODE_3_2)); 439 } 440 /* 441 boolean isLabelSeparator(int ch){ 442 int result = getCodePointValue(ch); 443 if( (result & 0x07) == LABEL_SEPARATOR){ 444 return true; 445 } 446 return false; 447 } 448 */ 449 /* 450 1) Map -- For each character in the input, check if it has a mapping 451 and, if so, replace it with its mapping. 452 453 2) Normalize -- Possibly normalize the result of step 1 using Unicode 454 normalization. 455 456 3) Prohibit -- Check for any characters that are not allowed in the 457 output. If any are found, return an error. 458 459 4) Check bidi -- Possibly check for right-to-left characters, and if 460 any are found, make sure that the whole string satisfies the 461 requirements for bidirectional strings. If the string does not 462 satisfy the requirements for bidirectional strings, return an 463 error. 464 [Unicode3.2] defines several bidirectional categories; each character 465 has one bidirectional category assigned to it. For the purposes of 466 the requirements below, an "RandALCat character" is a character that 467 has Unicode bidirectional categories "R" or "AL"; an "LCat character" 468 is a character that has Unicode bidirectional category "L". Note 469 470 471 that there are many characters which fall in neither of the above 472 definitions; Latin digits (<U+0030> through <U+0039>) are examples of 473 this because they have bidirectional category "EN". 474 475 In any profile that specifies bidirectional character handling, all 476 three of the following requirements MUST be met: 477 478 1) The characters in section 5.8 MUST be prohibited. 479 480 2) If a string contains any RandALCat character, the string MUST NOT 481 contain any LCat character. 482 483 3) If a string contains any RandALCat character, a RandALCat 484 character MUST be the first character of the string, and a 485 RandALCat character MUST be the last character of the string. 486 */ 487 /** 488 * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC), 489 * checks for prohibited and BiDi characters in the order defined by RFC 3454 490 * depending on the options specified in the profile. 491 * 492 * @param src A UCharacterIterator object containing the source string 493 * @param options A bit set of options: 494 * <ul> 495 * <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li> 496 * <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points are in the input 497 * as normal Unicode code points.</li> 498 * </ul> 499 * @return StringBuffer A StringBuffer containing the output 500 * @throws StringPrepParseException An exception occurs when parsing a string is invalid. 501 */ prepare(UCharacterIterator src, int options)502 public StringBuffer prepare(UCharacterIterator src, int options) 503 throws StringPrepParseException{ 504 505 // map 506 StringBuffer mapOut = map(src,options); 507 StringBuffer normOut = mapOut;// initialize 508 509 if(doNFKC){ 510 // normalize 511 normOut = normalize(mapOut); 512 } 513 514 int ch; 515 char result; 516 UCharacterIterator iter = UCharacterIterator.getInstance(normOut); 517 Values val = new Values(); 518 int direction=UCharacterDirection.CHAR_DIRECTION_COUNT, 519 firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT; 520 int rtlPos=-1, ltrPos=-1; 521 boolean rightToLeft=false, leftToRight=false; 522 523 while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ 524 result = getCodePointValue(ch); 525 getValues(result,val); 526 527 if(val.type == PROHIBITED ){ 528 throw new StringPrepParseException("A prohibited code point was found in the input", 529 StringPrepParseException.PROHIBITED_ERROR,iter.getText(),val.value); 530 } 531 532 if(checkBiDi) { 533 direction = bdp.getClass(ch); 534 if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){ 535 firstCharDir = direction; 536 } 537 if(direction == UCharacterDirection.LEFT_TO_RIGHT){ 538 leftToRight = true; 539 ltrPos = iter.getIndex()-1; 540 } 541 if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){ 542 rightToLeft = true; 543 rtlPos = iter.getIndex()-1; 544 } 545 } 546 } 547 if(checkBiDi == true){ 548 // satisfy 2 549 if( leftToRight == true && rightToLeft == true){ 550 throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.", 551 StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(), 552 (rtlPos>ltrPos) ? rtlPos : ltrPos); 553 } 554 555 //satisfy 3 556 if( rightToLeft == true && 557 !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) && 558 (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC)) 559 ){ 560 throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.", 561 StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(), 562 (rtlPos>ltrPos) ? rtlPos : ltrPos); 563 } 564 } 565 return normOut; 566 567 } 568 569 /** 570 * Prepare the input String for use in applications with the given profile. This operation maps, normalizes(NFKC), 571 * checks for prohibited and BiDi characters in the order defined by RFC 3454 572 * depending on the options specified in the profile. 573 * 574 * @param src A string 575 * @param options A bit set of options: 576 * <ul> 577 * <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li> 578 * <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points are in the input 579 * as normal Unicode code points.</li> 580 * </ul> 581 * @return String A String containing the output 582 * @throws StringPrepParseException An exception when parsing or preparing a string is invalid. 583 */ 584 @libcore.api.CorePlatformApi prepare(String src, int options)585 public String prepare(String src, int options) 586 throws StringPrepParseException{ 587 StringBuffer result = prepare(UCharacterIterator.getInstance(src), options); 588 return result.toString(); 589 } 590 } 591