1 /*
2  *  Licensed to the Apache Software Foundation (ASF) under one or more
3  *  contributor license agreements.  See the NOTICE file distributed with
4  *  this work for additional information regarding copyright ownership.
5  *  The ASF licenses this file to You under the Apache License, Version 2.0
6  *  (the "License"); you may not use this file except in compliance with
7  *  the License.  You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *  Unless required by applicable law or agreed to in writing, software
12  *  distributed under the License is distributed on an "AS IS" BASIS,
13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *  See the License for the specific language governing permissions and
15  *  limitations under the License.
16  */
17 
18 package java.nio.charset;
19 
20 import java.io.UnsupportedEncodingException;
21 import java.nio.ByteBuffer;
22 import java.nio.CharBuffer;
23 import java.nio.charset.spi.CharsetProvider;
24 import java.util.Collections;
25 import java.util.HashMap;
26 import java.util.HashSet;
27 import java.util.Iterator;
28 import java.util.Locale;
29 import java.util.ServiceLoader;
30 import java.util.Set;
31 import java.util.SortedMap;
32 import java.util.TreeMap;
33 import libcore.icu.NativeConverter;
34 
35 /**
36  * A charset is a named mapping between Unicode characters and byte sequences. Every
37  * {@code Charset} can <i>decode</i>, converting a byte sequence into a sequence of characters,
38  * and some can also <i>encode</i>, converting a sequence of characters into a byte sequence.
39  * Use the method {@link #canEncode} to find out whether a charset supports both.
40  *
41  * <h4>Characters</h4>
42  * <p>In the context of this class, <i>character</i> always refers to a Java character: a Unicode
43  * code point in the range U+0000 to U+FFFF. (Java represents supplementary characters using surrogates.)
44  * Not all byte sequences will represent a character, and not
45  * all characters can necessarily be represented by a given charset. The method {@link #contains}
46  * can be used to determine whether every character representable by one charset can also be
47  * represented by another (meaning that a lossless transformation is possible from the contained
48  * to the container).
49  *
50  * <h4>Encodings</h4>
51  * <p>There are many possible ways to represent Unicode characters as byte sequences.
52  * See <a href="http://www.unicode.org/reports/tr17/">UTR#17: Unicode Character Encoding Model</a>
53  * for detailed discussion.
54  *
55  * <p>The most important mappings capable of representing every character are the Unicode
56  * Transformation Format (UTF) charsets. Of those, UTF-8 and the UTF-16 family are the most
57  * common. UTF-8 (described in <a href="http://www.ietf.org/rfc/rfc3629.txt">RFC 3629</a>)
58  * encodes a character using 1 to 4 bytes. UTF-16 uses exactly 2 bytes per character (potentially
59  * wasting space, but allowing efficient random access into BMP text), and UTF-32 uses
60  * exactly 4 bytes per character (trading off even more space for efficient random access into text
61  * that includes supplementary characters).
62  *
63  * <p>UTF-16 and UTF-32 encode characters directly, using their code point as a two- or four-byte
64  * integer. This means that any given UTF-16 or UTF-32 byte sequence is either big- or
65  * little-endian. To assist decoders, Unicode includes a special <i>byte order mark</i> (BOM)
66  * character U+FEFF used to determine the endianness of a sequence. The corresponding byte-swapped
67  * code point U+FFFE is guaranteed never to be assigned. If a UTF-16 decoder sees
68  * {@code 0xfe, 0xff}, for example, it knows it's reading a big-endian byte sequence, while
 * {@code 0xff, 0xfe} would indicate a little-endian byte sequence.
70  *
71  * <p>UTF-8 can contain a BOM, but since the UTF-8 encoding of a character always uses the same
72  * byte sequence, there is no information about endianness to convey. Seeing the bytes
73  * corresponding to the UTF-8 encoding of U+FEFF ({@code 0xef, 0xbb, 0xbf}) would only serve to
74  * suggest that you're reading UTF-8. Note that BOMs are decoded as the U+FEFF character, and
75  * will appear in the output character sequence. This means that a disadvantage to including a BOM
76  * in UTF-8 is that most applications that use UTF-8 do not expect to see a BOM. (This is also a
77  * reason to prefer UTF-8: it's one less complication to worry about.)
78  *
79  * <p>Because a BOM indicates how the data that follows should be interpreted, a BOM should occur
80  * as the first character in a character sequence.
81  *
82  * <p>See the <a href="http://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a> for
83  * more about dealing with BOMs.
84  *
85  * <h4>Endianness and BOM behavior</h4>
86  *
87  * <p>The following tables show the endianness and BOM behavior of the UTF-16 variants.
88  *
89  * <p>This table shows what the encoder writes. "BE" means that the byte sequence is big-endian,
90  * "LE" means little-endian. "BE BOM" means a big-endian BOM (that is, {@code 0xfe, 0xff}).
91  * <p><table width="100%">
92  * <tr> <th>Charset</th>  <th>Encoder writes</th>  </tr>
93  * <tr> <td>UTF-16BE</td> <td>BE, no BOM</td>      </tr>
94  * <tr> <td>UTF-16LE</td> <td>LE, no BOM</td>      </tr>
95  * <tr> <td>UTF-16</td>   <td>BE, with BE BOM</td> </tr>
96  * </table>
97  *
98  * <p>The next table shows how each variant's decoder behaves when reading a byte sequence.
99  * The exact meaning of "failure" in the table is dependent on the
100  * {@link CodingErrorAction} supplied to {@link CharsetDecoder#malformedInputAction}, so
101  * "BE, failure" means "the byte sequence is treated as big-endian, and a little-endian BOM
102  * triggers the malformedInputAction".
103  *
104  * <p>The phrase "includes BOM" means that the output includes the U+FEFF byte order mark character.
105  *
106  * <p><table width="100%">
107  * <tr> <th>Charset</th>  <th>BE BOM</th>           <th>LE BOM</th>           <th>No BOM</th> </tr>
108  * <tr> <td>UTF-16BE</td> <td>BE, includes BOM</td> <td>BE, failure</td>      <td>BE</td>     </tr>
109  * <tr> <td>UTF-16LE</td> <td>LE, failure</td>      <td>LE, includes BOM</td> <td>LE</td>     </tr>
110  * <tr> <td>UTF-16</td>   <td>BE</td>               <td>LE</td>               <td>BE</td>     </tr>
111  * </table>
112  *
113  * <h4>Charset names</h4>
114  * <p>A charset has a canonical name, returned by {@link #name}. Most charsets will
115  * also have one or more aliases, returned by {@link #aliases}. A charset can be looked up
116  * by canonical name or any of its aliases using {@link #forName}.
117  *
118  * <h4>Guaranteed-available charsets</h4>
119  * <p>The following charsets are available on every Java implementation:
120  * <ul>
121  * <li>ISO-8859-1
122  * <li>US-ASCII
123  * <li>UTF-16
124  * <li>UTF-16BE
125  * <li>UTF-16LE
126  * <li>UTF-8
127  * </ul>
128  * <p>All of these charsets support both decoding and encoding. The charsets whose names begin
129  * "UTF" can represent all characters, as mentioned above. The "ISO-8859-1" and "US-ASCII" charsets
130  * can only represent small subsets of these characters. Except when required to do otherwise for
131  * compatibility, new code should use one of the UTF charsets listed above. The platform's default
132  * charset is UTF-8. (This is in contrast to some older implementations, where the default charset
133  * depended on the user's locale.)
134  *
135  * <p>Most implementations will support hundreds of charsets. Use {@link #availableCharsets} or
136  * {@link #isSupported} to see what's available. If you intend to use the charset if it's
137  * available, just call {@link #forName} and catch the exceptions it throws if the charset isn't
138  * available.
139  *
140  * <p>Additional charsets can be made available by configuring one or more charset
141  * providers through provider configuration files. Such files are always named
142  * as "java.nio.charset.spi.CharsetProvider" and located in the
143  * "META-INF/services" directory of one or more classpaths. The files should be
144  * encoded in "UTF-8". Each line of their content specifies the class name of a
145  * charset provider which extends {@link java.nio.charset.spi.CharsetProvider}.
146  * A line should end with '\r', '\n' or '\r\n'. Leading and trailing whitespace
147  * is trimmed. Blank lines, and lines (after trimming) starting with "#" which are
148  * regarded as comments, are both ignored. Duplicates of names already found are also
149  * ignored. Both the configuration files and the provider classes will be loaded
150  * using the thread context class loader.
151  *
 * <p>Although this class is thread-safe, the {@link CharsetDecoder} and {@link CharsetEncoder}
 * instances it returns are inherently stateful.
154  */
155 public abstract class Charset implements Comparable<Charset> {
    /**
     * Cache of charsets that have already been resolved, keyed by every name they are known
     * under: the canonical name, the name the caller asked for, and each alias. The code in this
     * class only touches the map while synchronized on the map itself.
     *
     * <p>This field must stay declared before {@code DEFAULT_CHARSET}: static initializers run
     * in textual order, and computing the default charset goes through {@code forName}, which
     * reads this map.
     */
    private static final HashMap<String, Charset> CACHED_CHARSETS = new HashMap<String, Charset>();

    /** The default charset, computed once while this class is being initialized. */
    private static final Charset DEFAULT_CHARSET = getDefaultCharset();

    /** The canonical name of this charset; checked for legality by the constructor. */
    private final String canonicalName;

    /** The distinct aliases of this charset; each is checked for legality by the constructor. */
    private final HashSet<String> aliasesSet;
163 
164     /**
165      * Constructs a <code>Charset</code> object. Duplicated aliases are
166      * ignored.
167      *
168      * @param canonicalName
169      *            the canonical name of the charset.
170      * @param aliases
171      *            an array containing all aliases of the charset. May be null.
172      * @throws IllegalCharsetNameException
173      *             on an illegal value being supplied for either
174      *             <code>canonicalName</code> or for any element of
175      *             <code>aliases</code>.
176      */
Charset(String canonicalName, String[] aliases)177     protected Charset(String canonicalName, String[] aliases) {
178         // Check whether the given canonical name is legal.
179         checkCharsetName(canonicalName);
180         this.canonicalName = canonicalName;
181 
182         // Collect and check each unique alias.
183         this.aliasesSet = new HashSet<String>();
184         if (aliases != null) {
185             for (String alias : aliases) {
186                 checkCharsetName(alias);
187                 this.aliasesSet.add(alias);
188             }
189         }
190     }
191 
checkCharsetName(String name)192     private static void checkCharsetName(String name) {
193         if (name.isEmpty()) {
194             throw new IllegalCharsetNameException(name);
195         }
196         if (!isValidCharsetNameStart(name.charAt(0))) {
197             throw new IllegalCharsetNameException(name);
198         }
199         for (int i = 1; i < name.length(); ++i) {
200             if (!isValidCharsetNamePart(name.charAt(i))) {
201                 throw new IllegalCharsetNameException(name);
202             }
203         }
204     }
205 
isValidCharsetNameStart(char c)206     private static boolean isValidCharsetNameStart(char c) {
207         return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
208     }
209 
isValidCharsetNamePart(char c)210     private static boolean isValidCharsetNamePart(char c) {
211         return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
212                 c == '-' || c == '.' || c == ':' || c == '_';
213     }
214 
215     /**
216      * Returns an immutable case-insensitive map from canonical names to {@code Charset} instances.
217      * If multiple charsets have the same canonical name, it is unspecified which is returned in
218      * the map. This method may be slow. If you know which charset you're looking for, use
219      * {@link #forName}.
220      */
availableCharsets()221     public static SortedMap<String, Charset> availableCharsets() {
222         // Start with a copy of the built-in charsets...
223         TreeMap<String, Charset> charsets = new TreeMap<String, Charset>(String.CASE_INSENSITIVE_ORDER);
224         for (String charsetName : NativeConverter.getAvailableCharsetNames()) {
225             Charset charset = NativeConverter.charsetForName(charsetName);
226             charsets.put(charset.name(), charset);
227         }
228 
229         // Add all charsets provided by all charset providers...
230         for (CharsetProvider charsetProvider : ServiceLoader.load(CharsetProvider.class)) {
231             Iterator<Charset> it = charsetProvider.charsets();
232             while (it.hasNext()) {
233                 Charset cs = it.next();
234                 // A CharsetProvider can't override a built-in Charset.
235                 if (!charsets.containsKey(cs.name())) {
236                     charsets.put(cs.name(), cs);
237                 }
238             }
239         }
240 
241         return Collections.unmodifiableSortedMap(charsets);
242     }
243 
cacheCharset(String charsetName, Charset cs)244     private static Charset cacheCharset(String charsetName, Charset cs) {
245         synchronized (CACHED_CHARSETS) {
246             // Get the canonical name for this charset, and the canonical instance from the table.
247             String canonicalName = cs.name();
248             Charset canonicalCharset = CACHED_CHARSETS.get(canonicalName);
249             if (canonicalCharset == null) {
250                 canonicalCharset = cs;
251             }
252 
253             // Cache the charset by its canonical name...
254             CACHED_CHARSETS.put(canonicalName, canonicalCharset);
255 
256             // And the name the user used... (Section 1.4 of http://unicode.org/reports/tr22/ means
257             // that many non-alias, non-canonical names are valid. For example, "utf8" isn't an
258             // alias of the canonical name "UTF-8", but we shouldn't penalize consistent users of
259             // such names unduly.)
260             CACHED_CHARSETS.put(charsetName, canonicalCharset);
261 
262             // And all its aliases...
263             for (String alias : cs.aliasesSet) {
264                 CACHED_CHARSETS.put(alias, canonicalCharset);
265             }
266 
267             return canonicalCharset;
268         }
269     }
270 
271     /**
272      * Returns a {@code Charset} instance for the named charset.
273      *
274      * @param charsetName a charset name (either canonical or an alias)
275      * @throws IllegalCharsetNameException
276      *             if the specified charset name is illegal.
277      * @throws UnsupportedCharsetException
278      *             if the desired charset is not supported by this runtime.
279      */
forName(String charsetName)280     public static Charset forName(String charsetName) {
281         // Is this charset in our cache?
282         Charset cs;
283         synchronized (CACHED_CHARSETS) {
284             cs = CACHED_CHARSETS.get(charsetName);
285             if (cs != null) {
286                 return cs;
287             }
288         }
289 
290         if (charsetName == null) {
291             throw new IllegalCharsetNameException(null);
292         }
293 
294         // Is this a built-in charset supported by ICU?
295         checkCharsetName(charsetName);
296         cs = NativeConverter.charsetForName(charsetName);
297         if (cs != null) {
298             return cacheCharset(charsetName, cs);
299         }
300 
301         // Does a configured CharsetProvider have this charset?
302         for (CharsetProvider charsetProvider : ServiceLoader.load(CharsetProvider.class)) {
303             cs = charsetProvider.charsetForName(charsetName);
304             if (cs != null) {
305                 return cacheCharset(charsetName, cs);
306             }
307         }
308 
309         throw new UnsupportedCharsetException(charsetName);
310     }
311 
312     /**
313      * Equivalent to {@code forName} but only throws {@code UnsupportedEncodingException},
314      * which is all pre-nio code claims to throw.
315      *
316      * @hide internal use only
317      */
forNameUEE(String charsetName)318     public static Charset forNameUEE(String charsetName) throws UnsupportedEncodingException {
319         try {
320             return Charset.forName(charsetName);
321         } catch (Exception cause) {
322             UnsupportedEncodingException ex = new UnsupportedEncodingException(charsetName);
323             ex.initCause(cause);
324             throw ex;
325         }
326     }
327 
328     /**
329      * Determines whether the specified charset is supported by this runtime.
330      *
331      * @param charsetName
332      *            the name of the charset.
333      * @return true if the specified charset is supported, otherwise false.
334      * @throws IllegalCharsetNameException
335      *             if the specified charset name is illegal.
336      */
isSupported(String charsetName)337     public static boolean isSupported(String charsetName) {
338         try {
339             forName(charsetName);
340             return true;
341         } catch (UnsupportedCharsetException ex) {
342             return false;
343         }
344     }
345 
    /**
     * Determines whether this charset is a superset of the given charset. A charset C1 contains
     * charset C2 if every character representable by C2 is also representable by C1. This means
     * that lossless conversion is possible from C2 to C1 (but not necessarily the other way
     * round). It does <i>not</i> imply that the two charsets use the same byte sequences for the
     * characters they share.
     *
     * <p>Note that this method is allowed to be conservative, and some implementations may return
     * false when this charset does contain the other charset. Android's implementation is precise,
     * and will always return true in such cases.
     *
     * <p>This method is abstract: each concrete charset supplies its own answer.
     *
     * @param charset
     *            the charset to test for containment in this charset.
     * @return true if this charset is a superset of the given charset,
     *         false if it's unknown or this charset is not a superset of
     *         the given charset.
     */
    public abstract boolean contains(Charset charset);
364 
    /**
     * Returns a new instance of an encoder for this charset. The convenience {@code encode}
     * methods of this class obtain their encoder through this method on every call.
     *
     * @return a new {@link CharsetEncoder} for this charset.
     */
    public abstract CharsetEncoder newEncoder();
369 
    /**
     * Returns a new instance of a decoder for this charset. The convenience {@code decode}
     * method of this class obtains its decoder through this method on every call.
     *
     * @return a new {@link CharsetDecoder} for this charset.
     */
    public abstract CharsetDecoder newDecoder();
374 
375     /**
376      * Returns the canonical name of this charset.
377      *
378      * <p>If a charset is in the IANA registry, this will be the MIME-preferred name (a charset
379      * may have multiple IANA-registered names). Otherwise the canonical name will begin with "x-"
380      * or "X-".
381      */
name()382     public final String name() {
383         return this.canonicalName;
384     }
385 
386     /**
387      * Returns an unmodifiable set of this charset's aliases.
388      */
aliases()389     public final Set<String> aliases() {
390         return Collections.unmodifiableSet(this.aliasesSet);
391     }
392 
393     /**
394      * Returns the name of this charset for the default locale.
395      *
396      * <p>The default implementation returns the canonical name of this charset.
397      * Subclasses may return a localized display name.
398      */
displayName()399     public String displayName() {
400         return this.canonicalName;
401     }
402 
403     /**
404      * Returns the name of this charset for the specified locale.
405      *
406      * <p>The default implementation returns the canonical name of this charset.
407      * Subclasses may return a localized display name.
408      */
displayName(Locale l)409     public String displayName(Locale l) {
410         return this.canonicalName;
411     }
412 
413     /**
414      * Returns true if this charset is known to be registered in the IANA
415      * Charset Registry.
416      */
isRegistered()417     public final boolean isRegistered() {
418         return !canonicalName.startsWith("x-") && !canonicalName.startsWith("X-");
419     }
420 
    /**
     * Returns true if this charset supports encoding, false otherwise.
     *
     * <p>The implementation in this class always returns true. The method is not final, so a
     * subclass can override it to report otherwise.
     *
     * @return true if this charset supports encoding.
     */
    public boolean canEncode() {
        return true;
    }
427 
428     /**
429      * Returns a new {@code ByteBuffer} containing the bytes encoding the characters from
430      * {@code buffer}.
431      * This method uses {@code CodingErrorAction.REPLACE}.
432      *
433      * <p>Applications should generally create a {@link CharsetEncoder} using {@link #newEncoder}
434      * for performance.
435      *
436      * @param buffer
437      *            the character buffer containing the content to be encoded.
438      * @return the result of the encoding.
439      */
encode(CharBuffer buffer)440     public final ByteBuffer encode(CharBuffer buffer) {
441         try {
442             return newEncoder()
443                     .onMalformedInput(CodingErrorAction.REPLACE)
444                     .onUnmappableCharacter(CodingErrorAction.REPLACE).encode(
445                             buffer);
446         } catch (CharacterCodingException ex) {
447             throw new Error(ex.getMessage(), ex);
448         }
449     }
450 
451     /**
452      * Returns a new {@code ByteBuffer} containing the bytes encoding the characters from {@code s}.
453      * This method uses {@code CodingErrorAction.REPLACE}.
454      *
455      * <p>Applications should generally create a {@link CharsetEncoder} using {@link #newEncoder}
456      * for performance.
457      *
458      * @param s the string to be encoded.
459      * @return the result of the encoding.
460      */
encode(String s)461     public final ByteBuffer encode(String s) {
462         return encode(CharBuffer.wrap(s));
463     }
464 
465     /**
466      * Returns a new {@code CharBuffer} containing the characters decoded from {@code buffer}.
467      * This method uses {@code CodingErrorAction.REPLACE}.
468      *
469      * <p>Applications should generally create a {@link CharsetDecoder} using {@link #newDecoder}
470      * for performance.
471      *
472      * @param buffer
473      *            the byte buffer containing the content to be decoded.
474      * @return a character buffer containing the output of the decoding.
475      */
decode(ByteBuffer buffer)476     public final CharBuffer decode(ByteBuffer buffer) {
477         try {
478             return newDecoder()
479                     .onMalformedInput(CodingErrorAction.REPLACE)
480                     .onUnmappableCharacter(CodingErrorAction.REPLACE).decode(buffer);
481         } catch (CharacterCodingException ex) {
482             throw new Error(ex.getMessage(), ex);
483         }
484     }
485 
486     /*
487      * -------------------------------------------------------------------
488      * Methods implementing parent interface Comparable
489      * -------------------------------------------------------------------
490      */
491 
492     /**
493      * Compares this charset with the given charset. This comparison is
494      * based on the case insensitive canonical names of the charsets.
495      *
496      * @param charset
497      *            the given object to be compared with.
498      * @return a negative integer if less than the given object, a positive
499      *         integer if larger than it, or 0 if equal to it.
500      */
compareTo(Charset charset)501     public final int compareTo(Charset charset) {
502         return this.canonicalName.compareToIgnoreCase(charset.canonicalName);
503     }
504 
505     /*
506      * -------------------------------------------------------------------
507      * Methods overriding parent class Object
508      * -------------------------------------------------------------------
509      */
510 
511     /**
512      * Determines whether this charset equals to the given object. They are
513      * considered to be equal if they have the same canonical name.
514      *
515      * @param obj
516      *            the given object to be compared with.
517      * @return true if they have the same canonical name, otherwise false.
518      */
519     @Override
equals(Object obj)520     public final boolean equals(Object obj) {
521         if (obj instanceof Charset) {
522             Charset that = (Charset) obj;
523             return this.canonicalName.equals(that.canonicalName);
524         }
525         return false;
526     }
527 
528     /**
529      * Gets the hash code of this charset.
530      *
531      * @return the hash code of this charset.
532      */
533     @Override
hashCode()534     public final int hashCode() {
535         return this.canonicalName.hashCode();
536     }
537 
538     /**
539      * Gets a string representation of this charset. Usually this contains the
540      * canonical name of the charset.
541      *
542      * @return a string representation of this charset.
543      */
544     @Override
toString()545     public final String toString() {
546         return getClass().getName() + "[" + this.canonicalName + "]";
547     }
548 
    /**
     * Returns the system's default charset. This is determined during VM startup, and will not
     * change thereafter. On Android, the default charset is UTF-8.
     *
     * <p>The value is computed once, when this class is initialized, and every call returns
     * that same instance.
     *
     * @return the default charset.
     */
    public static Charset defaultCharset() {
        return DEFAULT_CHARSET;
    }
556 
getDefaultCharset()557     private static Charset getDefaultCharset() {
558         String encoding = System.getProperty("file.encoding", "UTF-8");
559         try {
560             return Charset.forName(encoding);
561         } catch (UnsupportedCharsetException e) {
562             return Charset.forName("UTF-8");
563         }
564     }
565 }
566