1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package java.nio.charset; 19 20 import java.io.UnsupportedEncodingException; 21 import java.nio.ByteBuffer; 22 import java.nio.CharBuffer; 23 import java.nio.charset.spi.CharsetProvider; 24 import java.util.Collections; 25 import java.util.HashMap; 26 import java.util.HashSet; 27 import java.util.Iterator; 28 import java.util.Locale; 29 import java.util.ServiceLoader; 30 import java.util.Set; 31 import java.util.SortedMap; 32 import java.util.TreeMap; 33 import libcore.icu.NativeConverter; 34 35 /** 36 * A charset is a named mapping between Unicode characters and byte sequences. Every 37 * {@code Charset} can <i>decode</i>, converting a byte sequence into a sequence of characters, 38 * and some can also <i>encode</i>, converting a sequence of characters into a byte sequence. 39 * Use the method {@link #canEncode} to find out whether a charset supports both. 40 * 41 * <h4>Characters</h4> 42 * <p>In the context of this class, <i>character</i> always refers to a Java character: a Unicode 43 * code point in the range U+0000 to U+FFFF. (Java represents supplementary characters using surrogates.) 44 * Not all byte sequences will represent a character, and not 45 * all characters can necessarily be represented by a given charset. The method {@link #contains} 46 * can be used to determine whether every character representable by one charset can also be 47 * represented by another (meaning that a lossless transformation is possible from the contained 48 * to the container). 49 * 50 * <h4>Encodings</h4> 51 * <p>There are many possible ways to represent Unicode characters as byte sequences. 52 * See <a href="http://www.unicode.org/reports/tr17/">UTR#17: Unicode Character Encoding Model</a> 53 * for detailed discussion. 54 * 55 * <p>The most important mappings capable of representing every character are the Unicode 56 * Transformation Format (UTF) charsets. Of those, UTF-8 and the UTF-16 family are the most 57 * common. UTF-8 (described in <a href="http://www.ietf.org/rfc/rfc3629.txt">RFC 3629</a>) 58 * encodes a character using 1 to 4 bytes. UTF-16 uses exactly 2 bytes per character (potentially 59 * wasting space, but allowing efficient random access into BMP text), and UTF-32 uses 60 * exactly 4 bytes per character (trading off even more space for efficient random access into text 61 * that includes supplementary characters). 62 * 63 * <p>UTF-16 and UTF-32 encode characters directly, using their code point as a two- or four-byte 64 * integer. This means that any given UTF-16 or UTF-32 byte sequence is either big- or 65 * little-endian. To assist decoders, Unicode includes a special <i>byte order mark</i> (BOM) 66 * character U+FEFF used to determine the endianness of a sequence. The corresponding byte-swapped 67 * code point U+FFFE is guaranteed never to be assigned. If a UTF-16 decoder sees 68 * {@code 0xfe, 0xff}, for example, it knows it's reading a big-endian byte sequence, while 69 * {@code 0xff, 0xfe}, would indicate a little-endian byte sequence. 70 * 71 * <p>UTF-8 can contain a BOM, but since the UTF-8 encoding of a character always uses the same 72 * byte sequence, there is no information about endianness to convey. Seeing the bytes 73 * corresponding to the UTF-8 encoding of U+FEFF ({@code 0xef, 0xbb, 0xbf}) would only serve to 74 * suggest that you're reading UTF-8. Note that BOMs are decoded as the U+FEFF character, and 75 * will appear in the output character sequence. This means that a disadvantage to including a BOM 76 * in UTF-8 is that most applications that use UTF-8 do not expect to see a BOM. (This is also a 77 * reason to prefer UTF-8: it's one less complication to worry about.) 78 * 79 * <p>Because a BOM indicates how the data that follows should be interpreted, a BOM should occur 80 * as the first character in a character sequence. 81 * 82 * <p>See the <a href="http://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a> for 83 * more about dealing with BOMs. 84 * 85 * <h4>Endianness and BOM behavior</h4> 86 * 87 * <p>The following tables show the endianness and BOM behavior of the UTF-16 variants. 88 * 89 * <p>This table shows what the encoder writes. "BE" means that the byte sequence is big-endian, 90 * "LE" means little-endian. "BE BOM" means a big-endian BOM (that is, {@code 0xfe, 0xff}). 91 * <p><table width="100%"> 92 * <tr> <th>Charset</th> <th>Encoder writes</th> </tr> 93 * <tr> <td>UTF-16BE</td> <td>BE, no BOM</td> </tr> 94 * <tr> <td>UTF-16LE</td> <td>LE, no BOM</td> </tr> 95 * <tr> <td>UTF-16</td> <td>BE, with BE BOM</td> </tr> 96 * </table> 97 * 98 * <p>The next table shows how each variant's decoder behaves when reading a byte sequence. 99 * The exact meaning of "failure" in the table is dependent on the 100 * {@link CodingErrorAction} supplied to {@link CharsetDecoder#malformedInputAction}, so 101 * "BE, failure" means "the byte sequence is treated as big-endian, and a little-endian BOM 102 * triggers the malformedInputAction". 103 * 104 * <p>The phrase "includes BOM" means that the output includes the U+FEFF byte order mark character. 105 * 106 * <p><table width="100%"> 107 * <tr> <th>Charset</th> <th>BE BOM</th> <th>LE BOM</th> <th>No BOM</th> </tr> 108 * <tr> <td>UTF-16BE</td> <td>BE, includes BOM</td> <td>BE, failure</td> <td>BE</td> </tr> 109 * <tr> <td>UTF-16LE</td> <td>LE, failure</td> <td>LE, includes BOM</td> <td>LE</td> </tr> 110 * <tr> <td>UTF-16</td> <td>BE</td> <td>LE</td> <td>BE</td> </tr> 111 * </table> 112 * 113 * <h4>Charset names</h4> 114 * <p>A charset has a canonical name, returned by {@link #name}. Most charsets will 115 * also have one or more aliases, returned by {@link #aliases}. A charset can be looked up 116 * by canonical name or any of its aliases using {@link #forName}. 117 * 118 * <h4>Guaranteed-available charsets</h4> 119 * <p>The following charsets are available on every Java implementation: 120 * <ul> 121 * <li>ISO-8859-1 122 * <li>US-ASCII 123 * <li>UTF-16 124 * <li>UTF-16BE 125 * <li>UTF-16LE 126 * <li>UTF-8 127 * </ul> 128 * <p>All of these charsets support both decoding and encoding. The charsets whose names begin 129 * "UTF" can represent all characters, as mentioned above. The "ISO-8859-1" and "US-ASCII" charsets 130 * can only represent small subsets of these characters. Except when required to do otherwise for 131 * compatibility, new code should use one of the UTF charsets listed above. The platform's default 132 * charset is UTF-8. (This is in contrast to some older implementations, where the default charset 133 * depended on the user's locale.) 134 * 135 * <p>Most implementations will support hundreds of charsets. Use {@link #availableCharsets} or 136 * {@link #isSupported} to see what's available. If you intend to use the charset if it's 137 * available, just call {@link #forName} and catch the exceptions it throws if the charset isn't 138 * available. 139 * 140 * <p>Additional charsets can be made available by configuring one or more charset 141 * providers through provider configuration files. Such files are always named 142 * as "java.nio.charset.spi.CharsetProvider" and located in the 143 * "META-INF/services" directory of one or more classpaths. The files should be 144 * encoded in "UTF-8". Each line of their content specifies the class name of a 145 * charset provider which extends {@link java.nio.charset.spi.CharsetProvider}. 146 * A line should end with '\r', '\n' or '\r\n'. Leading and trailing whitespace 147 * is trimmed. Blank lines, and lines (after trimming) starting with "#" which are 148 * regarded as comments, are both ignored. Duplicates of names already found are also 149 * ignored. Both the configuration files and the provider classes will be loaded 150 * using the thread context class loader. 151 * 152 * <p>Although class is thread-safe, the {@link CharsetDecoder} and {@link CharsetEncoder} instances 153 * it returns are inherently stateful. 154 */ 155 public abstract class Charset implements Comparable<Charset> { 156 private static final HashMap<String, Charset> CACHED_CHARSETS = new HashMap<String, Charset>(); 157 158 private static final Charset DEFAULT_CHARSET = getDefaultCharset(); 159 160 private final String canonicalName; 161 162 private final HashSet<String> aliasesSet; 163 164 /** 165 * Constructs a <code>Charset</code> object. Duplicated aliases are 166 * ignored. 167 * 168 * @param canonicalName 169 * the canonical name of the charset. 170 * @param aliases 171 * an array containing all aliases of the charset. May be null. 172 * @throws IllegalCharsetNameException 173 * on an illegal value being supplied for either 174 * <code>canonicalName</code> or for any element of 175 * <code>aliases</code>. 176 */ Charset(String canonicalName, String[] aliases)177 protected Charset(String canonicalName, String[] aliases) { 178 // Check whether the given canonical name is legal. 179 checkCharsetName(canonicalName); 180 this.canonicalName = canonicalName; 181 182 // Collect and check each unique alias. 183 this.aliasesSet = new HashSet<String>(); 184 if (aliases != null) { 185 for (String alias : aliases) { 186 checkCharsetName(alias); 187 this.aliasesSet.add(alias); 188 } 189 } 190 } 191 checkCharsetName(String name)192 private static void checkCharsetName(String name) { 193 if (name.isEmpty()) { 194 throw new IllegalCharsetNameException(name); 195 } 196 if (!isValidCharsetNameStart(name.charAt(0))) { 197 throw new IllegalCharsetNameException(name); 198 } 199 for (int i = 1; i < name.length(); ++i) { 200 if (!isValidCharsetNamePart(name.charAt(i))) { 201 throw new IllegalCharsetNameException(name); 202 } 203 } 204 } 205 isValidCharsetNameStart(char c)206 private static boolean isValidCharsetNameStart(char c) { 207 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'); 208 } 209 isValidCharsetNamePart(char c)210 private static boolean isValidCharsetNamePart(char c) { 211 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || 212 c == '-' || c == '.' || c == ':' || c == '_'; 213 } 214 215 /** 216 * Returns an immutable case-insensitive map from canonical names to {@code Charset} instances. 217 * If multiple charsets have the same canonical name, it is unspecified which is returned in 218 * the map. This method may be slow. If you know which charset you're looking for, use 219 * {@link #forName}. 220 */ availableCharsets()221 public static SortedMap<String, Charset> availableCharsets() { 222 // Start with a copy of the built-in charsets... 223 TreeMap<String, Charset> charsets = new TreeMap<String, Charset>(String.CASE_INSENSITIVE_ORDER); 224 for (String charsetName : NativeConverter.getAvailableCharsetNames()) { 225 Charset charset = NativeConverter.charsetForName(charsetName); 226 charsets.put(charset.name(), charset); 227 } 228 229 // Add all charsets provided by all charset providers... 230 for (CharsetProvider charsetProvider : ServiceLoader.load(CharsetProvider.class)) { 231 Iterator<Charset> it = charsetProvider.charsets(); 232 while (it.hasNext()) { 233 Charset cs = it.next(); 234 // A CharsetProvider can't override a built-in Charset. 235 if (!charsets.containsKey(cs.name())) { 236 charsets.put(cs.name(), cs); 237 } 238 } 239 } 240 241 return Collections.unmodifiableSortedMap(charsets); 242 } 243 cacheCharset(String charsetName, Charset cs)244 private static Charset cacheCharset(String charsetName, Charset cs) { 245 synchronized (CACHED_CHARSETS) { 246 // Get the canonical name for this charset, and the canonical instance from the table. 247 String canonicalName = cs.name(); 248 Charset canonicalCharset = CACHED_CHARSETS.get(canonicalName); 249 if (canonicalCharset == null) { 250 canonicalCharset = cs; 251 } 252 253 // Cache the charset by its canonical name... 254 CACHED_CHARSETS.put(canonicalName, canonicalCharset); 255 256 // And the name the user used... (Section 1.4 of http://unicode.org/reports/tr22/ means 257 // that many non-alias, non-canonical names are valid. For example, "utf8" isn't an 258 // alias of the canonical name "UTF-8", but we shouldn't penalize consistent users of 259 // such names unduly.) 260 CACHED_CHARSETS.put(charsetName, canonicalCharset); 261 262 // And all its aliases... 263 for (String alias : cs.aliasesSet) { 264 CACHED_CHARSETS.put(alias, canonicalCharset); 265 } 266 267 return canonicalCharset; 268 } 269 } 270 271 /** 272 * Returns a {@code Charset} instance for the named charset. 273 * 274 * @param charsetName a charset name (either canonical or an alias) 275 * @throws IllegalCharsetNameException 276 * if the specified charset name is illegal. 277 * @throws UnsupportedCharsetException 278 * if the desired charset is not supported by this runtime. 279 */ forName(String charsetName)280 public static Charset forName(String charsetName) { 281 // Is this charset in our cache? 282 Charset cs; 283 synchronized (CACHED_CHARSETS) { 284 cs = CACHED_CHARSETS.get(charsetName); 285 if (cs != null) { 286 return cs; 287 } 288 } 289 290 if (charsetName == null) { 291 throw new IllegalCharsetNameException(null); 292 } 293 294 // Is this a built-in charset supported by ICU? 295 checkCharsetName(charsetName); 296 cs = NativeConverter.charsetForName(charsetName); 297 if (cs != null) { 298 return cacheCharset(charsetName, cs); 299 } 300 301 // Does a configured CharsetProvider have this charset? 302 for (CharsetProvider charsetProvider : ServiceLoader.load(CharsetProvider.class)) { 303 cs = charsetProvider.charsetForName(charsetName); 304 if (cs != null) { 305 return cacheCharset(charsetName, cs); 306 } 307 } 308 309 throw new UnsupportedCharsetException(charsetName); 310 } 311 312 /** 313 * Equivalent to {@code forName} but only throws {@code UnsupportedEncodingException}, 314 * which is all pre-nio code claims to throw. 315 * 316 * @hide internal use only 317 */ forNameUEE(String charsetName)318 public static Charset forNameUEE(String charsetName) throws UnsupportedEncodingException { 319 try { 320 return Charset.forName(charsetName); 321 } catch (Exception cause) { 322 UnsupportedEncodingException ex = new UnsupportedEncodingException(charsetName); 323 ex.initCause(cause); 324 throw ex; 325 } 326 } 327 328 /** 329 * Determines whether the specified charset is supported by this runtime. 330 * 331 * @param charsetName 332 * the name of the charset. 333 * @return true if the specified charset is supported, otherwise false. 334 * @throws IllegalCharsetNameException 335 * if the specified charset name is illegal. 336 */ isSupported(String charsetName)337 public static boolean isSupported(String charsetName) { 338 try { 339 forName(charsetName); 340 return true; 341 } catch (UnsupportedCharsetException ex) { 342 return false; 343 } 344 } 345 346 /** 347 * Determines whether this charset is a superset of the given charset. A charset C1 contains 348 * charset C2 if every character representable by C2 is also representable by C1. This means 349 * that lossless conversion is possible from C2 to C1 (but not necessarily the other way 350 * round). It does <i>not</i> imply that the two charsets use the same byte sequences for the 351 * characters they share. 352 * 353 * <p>Note that this method is allowed to be conservative, and some implementations may return 354 * false when this charset does contain the other charset. Android's implementation is precise, 355 * and will always return true in such cases. 356 * 357 * @param charset 358 * a given charset. 359 * @return true if this charset is a super set of the given charset, 360 * false if it's unknown or this charset is not a superset of 361 * the given charset. 362 */ contains(Charset charset)363 public abstract boolean contains(Charset charset); 364 365 /** 366 * Returns a new instance of an encoder for this charset. 367 */ newEncoder()368 public abstract CharsetEncoder newEncoder(); 369 370 /** 371 * Returns a new instance of a decoder for this charset. 372 */ newDecoder()373 public abstract CharsetDecoder newDecoder(); 374 375 /** 376 * Returns the canonical name of this charset. 377 * 378 * <p>If a charset is in the IANA registry, this will be the MIME-preferred name (a charset 379 * may have multiple IANA-registered names). Otherwise the canonical name will begin with "x-" 380 * or "X-". 381 */ name()382 public final String name() { 383 return this.canonicalName; 384 } 385 386 /** 387 * Returns an unmodifiable set of this charset's aliases. 388 */ aliases()389 public final Set<String> aliases() { 390 return Collections.unmodifiableSet(this.aliasesSet); 391 } 392 393 /** 394 * Returns the name of this charset for the default locale. 395 * 396 * <p>The default implementation returns the canonical name of this charset. 397 * Subclasses may return a localized display name. 398 */ displayName()399 public String displayName() { 400 return this.canonicalName; 401 } 402 403 /** 404 * Returns the name of this charset for the specified locale. 405 * 406 * <p>The default implementation returns the canonical name of this charset. 407 * Subclasses may return a localized display name. 408 */ displayName(Locale l)409 public String displayName(Locale l) { 410 return this.canonicalName; 411 } 412 413 /** 414 * Returns true if this charset is known to be registered in the IANA 415 * Charset Registry. 416 */ isRegistered()417 public final boolean isRegistered() { 418 return !canonicalName.startsWith("x-") && !canonicalName.startsWith("X-"); 419 } 420 421 /** 422 * Returns true if this charset supports encoding, false otherwise. 423 */ canEncode()424 public boolean canEncode() { 425 return true; 426 } 427 428 /** 429 * Returns a new {@code ByteBuffer} containing the bytes encoding the characters from 430 * {@code buffer}. 431 * This method uses {@code CodingErrorAction.REPLACE}. 432 * 433 * <p>Applications should generally create a {@link CharsetEncoder} using {@link #newEncoder} 434 * for performance. 435 * 436 * @param buffer 437 * the character buffer containing the content to be encoded. 438 * @return the result of the encoding. 439 */ encode(CharBuffer buffer)440 public final ByteBuffer encode(CharBuffer buffer) { 441 try { 442 return newEncoder() 443 .onMalformedInput(CodingErrorAction.REPLACE) 444 .onUnmappableCharacter(CodingErrorAction.REPLACE).encode( 445 buffer); 446 } catch (CharacterCodingException ex) { 447 throw new Error(ex.getMessage(), ex); 448 } 449 } 450 451 /** 452 * Returns a new {@code ByteBuffer} containing the bytes encoding the characters from {@code s}. 453 * This method uses {@code CodingErrorAction.REPLACE}. 454 * 455 * <p>Applications should generally create a {@link CharsetEncoder} using {@link #newEncoder} 456 * for performance. 457 * 458 * @param s the string to be encoded. 459 * @return the result of the encoding. 460 */ encode(String s)461 public final ByteBuffer encode(String s) { 462 return encode(CharBuffer.wrap(s)); 463 } 464 465 /** 466 * Returns a new {@code CharBuffer} containing the characters decoded from {@code buffer}. 467 * This method uses {@code CodingErrorAction.REPLACE}. 468 * 469 * <p>Applications should generally create a {@link CharsetDecoder} using {@link #newDecoder} 470 * for performance. 471 * 472 * @param buffer 473 * the byte buffer containing the content to be decoded. 474 * @return a character buffer containing the output of the decoding. 475 */ decode(ByteBuffer buffer)476 public final CharBuffer decode(ByteBuffer buffer) { 477 try { 478 return newDecoder() 479 .onMalformedInput(CodingErrorAction.REPLACE) 480 .onUnmappableCharacter(CodingErrorAction.REPLACE).decode(buffer); 481 } catch (CharacterCodingException ex) { 482 throw new Error(ex.getMessage(), ex); 483 } 484 } 485 486 /* 487 * ------------------------------------------------------------------- 488 * Methods implementing parent interface Comparable 489 * ------------------------------------------------------------------- 490 */ 491 492 /** 493 * Compares this charset with the given charset. This comparison is 494 * based on the case insensitive canonical names of the charsets. 495 * 496 * @param charset 497 * the given object to be compared with. 498 * @return a negative integer if less than the given object, a positive 499 * integer if larger than it, or 0 if equal to it. 500 */ compareTo(Charset charset)501 public final int compareTo(Charset charset) { 502 return this.canonicalName.compareToIgnoreCase(charset.canonicalName); 503 } 504 505 /* 506 * ------------------------------------------------------------------- 507 * Methods overriding parent class Object 508 * ------------------------------------------------------------------- 509 */ 510 511 /** 512 * Determines whether this charset equals to the given object. They are 513 * considered to be equal if they have the same canonical name. 514 * 515 * @param obj 516 * the given object to be compared with. 517 * @return true if they have the same canonical name, otherwise false. 518 */ 519 @Override equals(Object obj)520 public final boolean equals(Object obj) { 521 if (obj instanceof Charset) { 522 Charset that = (Charset) obj; 523 return this.canonicalName.equals(that.canonicalName); 524 } 525 return false; 526 } 527 528 /** 529 * Gets the hash code of this charset. 530 * 531 * @return the hash code of this charset. 532 */ 533 @Override hashCode()534 public final int hashCode() { 535 return this.canonicalName.hashCode(); 536 } 537 538 /** 539 * Gets a string representation of this charset. Usually this contains the 540 * canonical name of the charset. 541 * 542 * @return a string representation of this charset. 543 */ 544 @Override toString()545 public final String toString() { 546 return getClass().getName() + "[" + this.canonicalName + "]"; 547 } 548 549 /** 550 * Returns the system's default charset. This is determined during VM startup, and will not 551 * change thereafter. On Android, the default charset is UTF-8. 552 */ defaultCharset()553 public static Charset defaultCharset() { 554 return DEFAULT_CHARSET; 555 } 556 getDefaultCharset()557 private static Charset getDefaultCharset() { 558 String encoding = System.getProperty("file.encoding", "UTF-8"); 559 try { 560 return Charset.forName(encoding); 561 } catch (UnsupportedCharsetException e) { 562 return Charset.forName("UTF-8"); 563 } 564 } 565 } 566