• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2 *******************************************************************************
3 * Copyright (C) 2006-2016, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7 
8 package com.ibm.icu.charset;
9 
10 import java.lang.reflect.Constructor;
11 import java.lang.reflect.InvocationTargetException;
12 import java.nio.charset.Charset;
13 import java.nio.charset.IllegalCharsetNameException;
14 import java.nio.charset.UnsupportedCharsetException;
15 import java.util.HashMap;
16 
17 import com.ibm.icu.text.UnicodeSet;
18 
19 /**
20  * <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
21  * This API is used to convert codepage or character encoded data to and
22  * from UTF-16. You can open a converter with {@link Charset#forName} and {@link #forNameICU}. With that
23  * converter, you can get its properties, set options, convert your data.
24  *
25  * <p>Since many software programs recognize different converter names for
26  * different types of converters, there are other functions in this API to
27  * iterate over the converter aliases.
28  *
29  * <p>Note that {@link #name()} cannot always return a unique charset name.
30  * {@link Charset} documents that,
31  * for charsets listed in the IANA Charset Registry,
32  * the {@link #name()} must be listed there,
33  * and it “must be the MIME-preferred name” if there are multiple names.
34  *
35  * <p>However, there are different implementations of many if not most charsets,
36  * ICU provides multiple variants for some of them,
37  * ICU provides variants of some java.nio-system-supported charsets,
38  * and ICU users are free to add more variants.
39  * This is so that applications can be compatible with multiple implementations at the same time.
40  *
41  * <p>This is in conflict with the {@link Charset#name()} requirements.
42  * It is not possible to offer variants of an IANA charset and
43  * always use the MIME-preferred name and also have those names be unique.
44  *
45  * <p>{@link #name()} returns the MIME-preferred name, or IANA name,
46  * so that it can always be used for the charset field in internet protocols.
47  *
48  * <p>Same-name charsets are accessible via {@link Charset#forName} or {@link #forNameICU}
49  * by using unique aliases (e.g., the ICU-canonical names).
50  *
51  * <p>{@link Charset} also documents that
52  * “Two charsets are equal if, and only if, they have the same canonical names.”
53  * This is not possible.
54  *
55  * <p>Unfortunately, {@link Charset#equals} is final, and
56  * {@link Charset#availableCharsets} returns
57  * “a sorted map from canonical charset names to charset objects”.
58  * Since {@link #name()} cannot be unique,
59  * {@link #equals} cannot work properly in such cases, and
60  * {@link Charset#availableCharsets} can only include one variant for a name.
61  *
62  * @stable ICU 3.6
63  */
64 public abstract class CharsetICU extends Charset{
65 
66      String icuCanonicalName;
67      int options;
68 
69      float  maxCharsPerByte;
70 
71      String name; /* +4: 60  internal name of the converter- invariant chars */
72 
73      int codepage;               /* +64: 4 codepage # (now IBM-$codepage) */
74 
75      byte platform;                /* +68: 1 platform of the converter (only IBM now) */
76      byte conversionType;          /* +69: 1 conversion type */
77 
78      int minBytesPerChar;         /* +70: 1 Minimum # bytes per char in this codepage */
79      int maxBytesPerChar;         /* +71: 1 Maximum # bytes output per UChar in this codepage */
80 
81      byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4  [note:  4 and 8 byte boundary] */
82      byte subCharLen;              /* +76: 1 */
83 
84      byte hasToUnicodeFallback;   /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
85      byte hasFromUnicodeFallback; /* +78: 1 */
86      short unicodeMask;            /* +79: 1  bit 0: has supplementary  bit 1: has single surrogates */
87      byte subChar1;               /* +80: 1  single-byte substitution character for IBM MBCS (0 if none) */
88      //byte reserved[/*19*/];           /* +81: 19 to round out the structure */
89 
90 
91     // typedef enum UConverterUnicodeSet {
92      /**
93       * Parameter that select the set of roundtrippable Unicode code points.
94       * @stable ICU 4.0
95       */
96       public static final int ROUNDTRIP_SET=0;
97       /**
98        * Select the set of Unicode code points with roundtrip or fallback mappings.
99        * Not supported at this point.
100        * @internal
101        * @deprecated This API is ICU internal only.
102        */
103       @Deprecated
104       public static final int ROUNDTRIP_AND_FALLBACK_SET =1;
105 
106     //} UConverterUnicodeSet;
107 
108     /**
109      *
110      * @param icuCanonicalName
111      * @param canonicalName
112      * @param aliases
113      * @stable ICU 3.6
114      */
CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases)115     protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
116         super(canonicalName,aliases);
117         if(canonicalName.length() == 0){
118             throw new IllegalCharsetNameException(canonicalName);
119         }
120         this.icuCanonicalName  = icuCanonicalName;
121     }
122 
123     /**
124      * Ascertains if a charset is a sub set of this charset
125      * Implements the abstract method of super class.
126      * @param cs charset to test
127      * @return true if the given charset is a subset of this charset
128      * @stable ICU 3.6
129      */
contains(Charset cs)130     public boolean contains(Charset cs){
131         if (null == cs) {
132             return false;
133         } else if (this.equals(cs)) {
134             return true;
135         }
136         return false;
137     }
138     private static final HashMap<String, String> algorithmicCharsets = new HashMap<String, String>();
139     static{
140         algorithmicCharsets.put("LMBCS-1",               "com.ibm.icu.charset.CharsetLMBCS");
141         algorithmicCharsets.put("LMBCS-2",               "com.ibm.icu.charset.CharsetLMBCS");
142         algorithmicCharsets.put("LMBCS-3",               "com.ibm.icu.charset.CharsetLMBCS");
143         algorithmicCharsets.put("LMBCS-4",               "com.ibm.icu.charset.CharsetLMBCS");
144         algorithmicCharsets.put("LMBCS-5",               "com.ibm.icu.charset.CharsetLMBCS");
145         algorithmicCharsets.put("LMBCS-6",               "com.ibm.icu.charset.CharsetLMBCS");
146         algorithmicCharsets.put("LMBCS-8",               "com.ibm.icu.charset.CharsetLMBCS");
147         algorithmicCharsets.put("LMBCS-11",              "com.ibm.icu.charset.CharsetLMBCS");
148         algorithmicCharsets.put("LMBCS-16",              "com.ibm.icu.charset.CharsetLMBCS");
149         algorithmicCharsets.put("LMBCS-17",              "com.ibm.icu.charset.CharsetLMBCS");
150         algorithmicCharsets.put("LMBCS-18",              "com.ibm.icu.charset.CharsetLMBCS");
151         algorithmicCharsets.put("LMBCS-19",              "com.ibm.icu.charset.CharsetLMBCS");
152         algorithmicCharsets.put("BOCU-1",                "com.ibm.icu.charset.CharsetBOCU1" );
153         algorithmicCharsets.put("SCSU",                  "com.ibm.icu.charset.CharsetSCSU" );
154         algorithmicCharsets.put("US-ASCII",              "com.ibm.icu.charset.CharsetASCII" );
155         algorithmicCharsets.put("ISO-8859-1",            "com.ibm.icu.charset.Charset88591" );
156         algorithmicCharsets.put("UTF-16",                "com.ibm.icu.charset.CharsetUTF16" );
157         algorithmicCharsets.put("UTF-16BE",              "com.ibm.icu.charset.CharsetUTF16BE" );
158         algorithmicCharsets.put("UTF-16BE,version=1",    "com.ibm.icu.charset.CharsetUTF16BE" );
159         algorithmicCharsets.put("UTF-16LE",              "com.ibm.icu.charset.CharsetUTF16LE" );
160         algorithmicCharsets.put("UTF-16LE,version=1",    "com.ibm.icu.charset.CharsetUTF16LE" );
161         algorithmicCharsets.put("UTF16_OppositeEndian",  "com.ibm.icu.charset.CharsetUTF16LE" );
162         algorithmicCharsets.put("UTF16_PlatformEndian",  "com.ibm.icu.charset.CharsetUTF16" );
163         algorithmicCharsets.put("UTF-32",                "com.ibm.icu.charset.CharsetUTF32" );
164         algorithmicCharsets.put("UTF-32BE",              "com.ibm.icu.charset.CharsetUTF32BE" );
165         algorithmicCharsets.put("UTF-32LE",              "com.ibm.icu.charset.CharsetUTF32LE" );
166         algorithmicCharsets.put("UTF32_OppositeEndian",  "com.ibm.icu.charset.CharsetUTF32LE" );
167         algorithmicCharsets.put("UTF32_PlatformEndian",  "com.ibm.icu.charset.CharsetUTF32" );
168         algorithmicCharsets.put("UTF-8",                 "com.ibm.icu.charset.CharsetUTF8" );
169         algorithmicCharsets.put("CESU-8",                "com.ibm.icu.charset.CharsetCESU8" );
170         algorithmicCharsets.put("UTF-7",                 "com.ibm.icu.charset.CharsetUTF7" );
171         algorithmicCharsets.put("ISCII,version=0",       "com.ibm.icu.charset.CharsetISCII" );
172         algorithmicCharsets.put("ISCII,version=1",       "com.ibm.icu.charset.CharsetISCII" );
173         algorithmicCharsets.put("ISCII,version=2",       "com.ibm.icu.charset.CharsetISCII" );
174         algorithmicCharsets.put("ISCII,version=3",       "com.ibm.icu.charset.CharsetISCII" );
175         algorithmicCharsets.put("ISCII,version=4",       "com.ibm.icu.charset.CharsetISCII" );
176         algorithmicCharsets.put("ISCII,version=5",       "com.ibm.icu.charset.CharsetISCII" );
177         algorithmicCharsets.put("ISCII,version=6",       "com.ibm.icu.charset.CharsetISCII" );
178         algorithmicCharsets.put("ISCII,version=7",       "com.ibm.icu.charset.CharsetISCII" );
179         algorithmicCharsets.put("ISCII,version=8",       "com.ibm.icu.charset.CharsetISCII" );
180         algorithmicCharsets.put("IMAP-mailbox-name",     "com.ibm.icu.charset.CharsetUTF7" );
181         algorithmicCharsets.put("HZ",                    "com.ibm.icu.charset.CharsetHZ" );
182         algorithmicCharsets.put("ISO_2022,locale=ja,version=0",               "com.ibm.icu.charset.CharsetISO2022" );
183         algorithmicCharsets.put("ISO_2022,locale=ja,version=1",               "com.ibm.icu.charset.CharsetISO2022" );
184         algorithmicCharsets.put("ISO_2022,locale=ja,version=2",               "com.ibm.icu.charset.CharsetISO2022" );
185         algorithmicCharsets.put("ISO_2022,locale=ja,version=3",               "com.ibm.icu.charset.CharsetISO2022" );
186         algorithmicCharsets.put("ISO_2022,locale=ja,version=4",               "com.ibm.icu.charset.CharsetISO2022" );
187         algorithmicCharsets.put("ISO_2022,locale=zh,version=0",               "com.ibm.icu.charset.CharsetISO2022" );
188         algorithmicCharsets.put("ISO_2022,locale=zh,version=1",               "com.ibm.icu.charset.CharsetISO2022" );
189         algorithmicCharsets.put("ISO_2022,locale=zh,version=2",               "com.ibm.icu.charset.CharsetISO2022" );
190         algorithmicCharsets.put("ISO_2022,locale=ko,version=0",               "com.ibm.icu.charset.CharsetISO2022" );
191         algorithmicCharsets.put("ISO_2022,locale=ko,version=1",               "com.ibm.icu.charset.CharsetISO2022" );
192         algorithmicCharsets.put("x11-compound-text",                          "com.ibm.icu.charset.CharsetCompoundText" );
193         }
194 
getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases)195     /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){
196        String className = algorithmicCharsets.get(icuCanonicalName);
197        if(className==null){
198            //all the cnv files are loaded as MBCS
199            className = "com.ibm.icu.charset.CharsetMBCS";
200        }
201        try{
202            CharsetICU conv = null;
203            Class<? extends CharsetICU> cs = Class.forName(className).asSubclass(CharsetICU.class);
204            Class<?>[] paramTypes = new Class<?>[]{ String.class, String.class,  String[].class};
205            final Constructor<? extends CharsetICU> c = cs.getConstructor(paramTypes);
206            Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases};
207 
208            // Run constructor
209            try {
210                conv = c.newInstance(params);
211                if (conv != null) {
212                    return conv;
213                }
214            }catch (InvocationTargetException e) {
215                Throwable cause = e.getCause();
216                UnsupportedCharsetException e2 = new UnsupportedCharsetException(
217                        icuCanonicalName + ": " + "Could not load " + className + ". Exception: " + cause);
218                e2.initCause(cause);
219                throw e2;
220            }
221        }catch(ClassNotFoundException ex){
222        }catch(NoSuchMethodException ex){
223        }catch (IllegalAccessException ex){
224        }catch (InstantiationException ex){
225        }
226        throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className);
227     }
228 
isSurrogate(int c)229     static final boolean isSurrogate(int c){
230         return (((c)&0xfffff800)==0xd800);
231     }
232 
233     /*
234      * Returns the default charset name
235      */
236 //    static final String getDefaultCharsetName(){
237 //        String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
238 //        return defaultEncoding;
239 //    }
240 
241     /**
242      * Returns a charset object for the named charset.
243      * This method gurantee that ICU charset is returned when
244      * available.  If the ICU charset provider does not support
245      * the specified charset, then try other charset providers
246      * including the standard Java charset provider.
247      *
248      * @param charsetName The name of the requested charset,
249      * may be either a canonical name or an alias
250      * @return A charset object for the named charset
251      * @throws IllegalCharsetNameException If the given charset name
252      * is illegal
253      * @throws UnsupportedCharsetException If no support for the
254      * named charset is available in this instance of th Java
255      * virtual machine
256      * @stable ICU 3.6
257      */
forNameICU(String charsetName)258     public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
259         CharsetProviderICU icuProvider = new CharsetProviderICU();
260         CharsetICU cs = (CharsetICU) icuProvider.charsetForName(charsetName);
261         if (cs != null) {
262             return cs;
263         }
264         return Charset.forName(charsetName);
265     }
266 
267 //    /**
268 //     * @see java.lang.Comparable#compareTo(java.lang.Object)
269 //     * @stable 3.8
270 //     */
271 //    public int compareTo(Object otherObj) {
272 //        if (!(otherObj instanceof CharsetICU)) {
273 //            return -1;
274 //        }
275 //        return icuCanonicalName.compareTo(((CharsetICU)otherObj).icuCanonicalName);
276 //    }
277 
278     /**
279      * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
280      * start of the stream for example U+FEFF (the Unicode BOM/signature
281      * character) that can be ignored.
282      *
283      * Detects Unicode signature byte sequences at the start of the byte stream
284      * and returns number of bytes of the BOM of the indicated Unicode charset.
285      * 0 is returned when no Unicode signature is recognized.
286      *
287      */
288     // TODO This should be proposed as CharsetDecoderICU API.
289 //    static String detectUnicodeSignature(ByteBuffer source) {
290 //        int signatureLength = 0; // number of bytes of the signature
291 //        final int SIG_MAX_LEN = 5;
292 //        String sigUniCharset = null; // states what unicode charset is the BOM
293 //        int i = 0;
294 //
295 //        /*
296 //         * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
297 //         * don't misdetect something
298 //         */
299 //        byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
300 //                (byte) 0xa5 };
301 //
302 //        while (i < source.remaining() && i < SIG_MAX_LEN) {
303 //            start[i] = source.get(i);
304 //            i++;
305 //        }
306 //
307 //        if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
308 //            signatureLength = 2;
309 //            sigUniCharset = "UTF-16BE";
310 //            source.position(signatureLength);
311 //            return sigUniCharset;
312 //        } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
313 //            if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
314 //                signatureLength = 4;
315 //                sigUniCharset = "UTF-32LE";
316 //                source.position(signatureLength);
317 //                return sigUniCharset;
318 //            } else {
319 //                signatureLength = 2;
320 //                sigUniCharset = "UTF-16LE";
321 //                source.position(signatureLength);
322 //                return sigUniCharset;
323 //            }
324 //        } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
325 //                && start[2] == (byte) 0xBF) {
326 //            signatureLength = 3;
327 //            sigUniCharset = "UTF-8";
328 //            source.position(signatureLength);
329 //            return sigUniCharset;
330 //        } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
331 //                && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
332 //            signatureLength = 4;
333 //            sigUniCharset = "UTF-32BE";
334 //            source.position(signatureLength);
335 //            return sigUniCharset;
336 //        } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
337 //                && start[2] == (byte) 0xFF) {
338 //            signatureLength = 3;
339 //            sigUniCharset = "SCSU";
340 //            source.position(signatureLength);
341 //            return sigUniCharset;
342 //        } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
343 //                && start[2] == (byte) 0x28) {
344 //            signatureLength = 3;
345 //            sigUniCharset = "BOCU-1";
346 //            source.position(signatureLength);
347 //            return sigUniCharset;
348 //        } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
349 //                && start[2] == (byte) 0x76) {
350 //
351 //            if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
352 //                signatureLength = 5;
353 //                sigUniCharset = "UTF-7";
354 //                source.position(signatureLength);
355 //                return sigUniCharset;
356 //            } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
357 //                    || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
358 //                signatureLength = 4;
359 //                sigUniCharset = "UTF-7";
360 //                source.position(signatureLength);
361 //                return sigUniCharset;
362 //            }
363 //        } else if (start[0] == (byte) 0xDD && start[2] == (byte) 0x73
364 //                && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
365 //            signatureLength = 4;
366 //            sigUniCharset = "UTF-EBCDIC";
367 //            source.position(signatureLength);
368 //            return sigUniCharset;
369 //        }
370 //
371 //        /* no known Unicode signature byte sequence recognized */
372 //        return null;
373 //    }
374 
375 
getUnicodeSetImpl(UnicodeSet setFillIn, int which)376     abstract void getUnicodeSetImpl(UnicodeSet setFillIn, int which);
377 
378     /**
379     * Returns the set of Unicode code points that can be converted by an ICU Converter.
380     *
381     * <p>The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): The set of all Unicode code points that can be
382     * roundtrip-converted (converted without any data loss) with the converter This set will not include code points that have fallback
383     * mappings or are only the result of reverse fallback mappings.  See UTR #22 "Character Mapping Markup Language" at  <a href="http://www.unicode.org/reports/tr22/">http://www.unicode.org/reports/tr22/</a>
384     *
385     * <p>In the future, there may be more UConverterUnicodeSet choices to select sets with different properties.
386     *
387     * <p>This is useful for example for
388     * <ul><li>checking that a string or document can be roundtrip-converted with a converter,
389     *   without/before actually performing the conversion</li>
390     * <li>testing if a converter can be used for text for typical text for a certain locale,
391     *   by comparing its roundtrip set with the set of ExemplarCharacters from
392     *   ICU's locale data or other sources</li></ul>
393     *
394     * @param setFillIn A valid UnicodeSet. It will be cleared by this function before
395     *                   the converter's specific set is filled in.
396     * @param which A selector; currently ROUNDTRIP_SET is the only supported value.
397     * @throws IllegalArgumentException if the parameters does not match.
398     * @stable ICU 4.0
399     */
getUnicodeSet(UnicodeSet setFillIn, int which)400        public void getUnicodeSet(UnicodeSet setFillIn, int which){
401            if( setFillIn == null || which != ROUNDTRIP_SET ){
402                throw new IllegalArgumentException();
403            }
404            setFillIn.clear();
405            getUnicodeSetImpl(setFillIn, which);
406        }
407 
408        /**
409         * Returns whether or not the charset of the converter has a fixed number of bytes
410         * per charset character.
411         * An example of this are converters that are of the type UCNV_SBCS or UCNV_DBCS.
412         * Another example is UTF-32 which is always 4 bytes per character.  A UTF-32 code point
413         * may represent more than one UTF-8 or UTF-16 code units but always have size of 4 bytes.
414         * Note: This method is not intended to be used to determine whether the charset has a
415         * fixed ratio of bytes to Unicode codes units for any particular Unicode encoding form.
416         * @return true if the converter is fixed-width
417         * @stable ICU 4.8
418         */
isFixedWidth()419        public boolean isFixedWidth() {
420            if (this instanceof CharsetASCII || this instanceof CharsetUTF32) {
421                return true;
422            }
423 
424            if (this instanceof CharsetMBCS) {
425                if (((CharsetMBCS)this).sharedData.staticData.maxBytesPerChar == ((CharsetMBCS)this).sharedData.staticData.minBytesPerChar) {
426                    return true;
427                }
428            }
429 
430            return false;
431        }
432 
getNonSurrogateUnicodeSet(UnicodeSet setFillIn)433        static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn){
434            setFillIn.add(0, 0xd7ff);
435            setFillIn.add(0xe000, 0x10ffff);
436        }
437 
getCompleteUnicodeSet(UnicodeSet setFillIn)438        static void getCompleteUnicodeSet(UnicodeSet setFillIn){
439            setFillIn.add(0, 0x10ffff);
440        }
441 }
442