/****************************************************************
* Licensed to the Apache Software Foundation (ASF) under one *
* or more contributor license agreements. See the NOTICE file *
* distributed with this work for additional information *
* regarding copyright ownership. The ASF licenses this file *
* to you under the Apache License, Version 2.0 (the *
* "License"); you may not use this file except in compliance *
* with the License. You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, *
* software distributed under the License is distributed on an *
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
* KIND, either express or implied. See the License for the *
* specific language governing permissions and limitations *
* under the License. *
****************************************************************/
package org.apache.james.mime4j.util;
import java.io.UnsupportedEncodingException;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;
import java.util.Locale;
import java.util.TreeSet;
//BEGIN android-changed: Stubbing out logging
import org.apache.james.mime4j.Log;
import org.apache.james.mime4j.LogFactory;
//END android-changed
/**
 * Utility class for working with character sets. It is somewhat similar to
 * the Java 1.4 {@code java.nio.charset.Charset} class but knows many
 * more aliases and is compatible with Java 1.3. It will use a simple detection
 * mechanism to detect what character sets the current VM supports. This will
 * be a subset of the character sets listed in the
 * "Java 1.5 (J2SE5.0) Supported Encodings" document.
 * <p>
 * The "IANA Character Sets" document has been used to determine the preferred
 * MIME character set names and to get a list of known aliases.
 * <p>
 * This is a complete list of the character sets known to this class:
 *
Canonical (Java) name | *MIME preferred | *Aliases | *
ASCII | *US-ASCII | *ANSI_X3.4-1968 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ISO646-US us IBM367 cp367 csASCII ascii7 646 iso_646.irv:1983 | *
Big5 | *Big5 | *csBig5 CN-Big5 BIG-FIVE BIGFIVE | *
Big5_HKSCS | *Big5-HKSCS | *big5hkscs | *
Big5_Solaris | *? | ** |
Cp037 | *IBM037 | *ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 | *
Cp1006 | *? | ** |
Cp1025 | *? | ** |
Cp1026 | *IBM1026 | *csIBM1026 | *
Cp1046 | *? | ** |
Cp1047 | *IBM1047 | *IBM-1047 | *
Cp1097 | *? | ** |
Cp1098 | *? | ** |
Cp1112 | *? | ** |
Cp1122 | *? | ** |
Cp1123 | *? | ** |
Cp1124 | *? | ** |
Cp1140 | *IBM01140 | *CCSID01140 CP01140 ebcdic-us-37+euro | *
Cp1141 | *IBM01141 | *CCSID01141 CP01141 ebcdic-de-273+euro | *
Cp1142 | *IBM01142 | *CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro | *
Cp1143 | *IBM01143 | *CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro | *
Cp1144 | *IBM01144 | *CCSID01144 CP01144 ebcdic-it-280+euro | *
Cp1145 | *IBM01145 | *CCSID01145 CP01145 ebcdic-es-284+euro | *
Cp1146 | *IBM01146 | *CCSID01146 CP01146 ebcdic-gb-285+euro | *
Cp1147 | *IBM01147 | *CCSID01147 CP01147 ebcdic-fr-297+euro | *
Cp1148 | *IBM01148 | *CCSID01148 CP01148 ebcdic-international-500+euro | *
Cp1149 | *IBM01149 | *CCSID01149 CP01149 ebcdic-is-871+euro | *
Cp1250 | *windows-1250 | ** |
Cp1251 | *windows-1251 | ** |
Cp1252 | *windows-1252 | ** |
Cp1253 | *windows-1253 | ** |
Cp1254 | *windows-1254 | ** |
Cp1255 | *windows-1255 | ** |
Cp1256 | *windows-1256 | ** |
Cp1257 | *windows-1257 | ** |
Cp1258 | *windows-1258 | ** |
Cp1381 | *? | ** |
Cp1383 | *? | ** |
Cp273 | *IBM273 | *csIBM273 | *
Cp277 | *IBM277 | *EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 | *
Cp278 | *IBM278 | *CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278 | *
Cp280 | *IBM280 | *ebcdic-cp-it csIBM280 | *
Cp284 | *IBM284 | *ebcdic-cp-es csIBM284 | *
Cp285 | *IBM285 | *ebcdic-cp-gb csIBM285 | *
Cp297 | *IBM297 | *ebcdic-cp-fr csIBM297 | *
Cp33722 | *? | ** |
Cp420 | *IBM420 | *ebcdic-cp-ar1 csIBM420 | *
Cp424 | *IBM424 | *ebcdic-cp-he csIBM424 | *
Cp437 | *IBM437 | *437 csPC8CodePage437 | *
Cp500 | *IBM500 | *ebcdic-cp-be ebcdic-cp-ch csIBM500 | *
Cp737 | *? | ** |
Cp775 | *IBM775 | *csPC775Baltic | *
Cp838 | *IBM-Thai | ** |
Cp850 | *IBM850 | *850 csPC850Multilingual | *
Cp852 | *IBM852 | *852 csPCp852 | *
Cp855 | *IBM855 | *855 csIBM855 | *
Cp856 | *? | ** |
Cp857 | *IBM857 | *857 csIBM857 | *
Cp858 | *IBM00858 | *CCSID00858 CP00858 PC-Multilingual-850+euro | *
Cp860 | *IBM860 | *860 csIBM860 | *
Cp861 | *IBM861 | *861 cp-is csIBM861 | *
Cp862 | *IBM862 | *862 csPC862LatinHebrew | *
Cp863 | *IBM863 | *863 csIBM863 | *
Cp864 | *IBM864 | *cp864 csIBM864 | *
Cp865 | *IBM865 | *865 csIBM865 | *
Cp866 | *IBM866 | *866 csIBM866 | *
Cp868 | *IBM868 | *cp-ar csIBM868 | *
Cp869 | *IBM869 | *cp-gr csIBM869 | *
Cp870 | *IBM870 | *ebcdic-cp-roece ebcdic-cp-yu csIBM870 | *
Cp871 | *IBM871 | *ebcdic-cp-is csIBM871 | *
Cp875 | *? | ** |
Cp918 | *IBM918 | *ebcdic-cp-ar2 csIBM918 | *
Cp921 | *? | ** |
Cp922 | *? | ** |
Cp930 | *? | ** |
Cp933 | *? | ** |
Cp935 | *? | ** |
Cp937 | *? | ** |
Cp939 | *? | ** |
Cp942 | *? | ** |
Cp942C | *? | ** |
Cp943 | *? | ** |
Cp943C | *? | ** |
Cp948 | *? | ** |
Cp949 | *? | ** |
Cp949C | *? | ** |
Cp950 | *? | ** |
Cp964 | *? | ** |
Cp970 | *? | ** |
EUC_CN | *GB2312 | *x-EUC-CN csGB2312 euccn euc-cn gb2312-80 gb2312-1980 CN-GB CN-GB-ISOIR165 | *
EUC_JP | *EUC-JP | *csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese eucjis x-eucjp eucjp x-euc-jp | *
EUC_JP_LINUX | *? | ** |
EUC_JP_Solaris | *? | ** |
EUC_KR | *EUC-KR | *csEUCKR ksc5601 5601 ksc5601_1987 ksc_5601 ksc5601-1987 ks_c_5601-1987 euckr | *
EUC_TW | *EUC-TW | *x-EUC-TW cns11643 euctw | *
GB18030 | *GB18030 | *gb18030-2000 | *
GBK | *windows-936 | *CP936 MS936 ms_936 x-mswin-936 | *
ISCII91 | *? | *x-ISCII91 iscii | *
ISO2022CN | *ISO-2022-CN | ** |
ISO2022JP | *ISO-2022-JP | *csISO2022JP JIS jis_encoding csjisencoding | *
ISO2022KR | *ISO-2022-KR | *csISO2022KR | *
ISO2022_CN_CNS | *? | ** |
ISO2022_CN_GB | *? | ** |
ISO8859_1 | *ISO-8859-1 | *ISO_8859-1:1987 iso-ir-100 ISO_8859-1 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 819 IBM-819 ISO8859-1 ISO_8859_1 | *
ISO8859_13 | *ISO-8859-13 | ** |
ISO8859_15 | *ISO-8859-15 | *ISO_8859-15 Latin-9 8859_15 csISOlatin9 IBM923 cp923 923 L9 IBM-923 ISO8859-15 LATIN9 LATIN0 csISOlatin0 ISO8859_15_FDIS | *
ISO8859_2 | *ISO-8859-2 | *ISO_8859-2:1987 iso-ir-101 ISO_8859-2 latin2 l2 csISOLatin2 8859_2 iso8859_2 | *
ISO8859_3 | *ISO-8859-3 | *ISO_8859-3:1988 iso-ir-109 ISO_8859-3 latin3 l3 csISOLatin3 8859_3 | *
ISO8859_4 | *ISO-8859-4 | *ISO_8859-4:1988 iso-ir-110 ISO_8859-4 latin4 l4 csISOLatin4 8859_4 | *
ISO8859_5 | *ISO-8859-5 | *ISO_8859-5:1988 iso-ir-144 ISO_8859-5 cyrillic csISOLatinCyrillic 8859_5 | *
ISO8859_6 | *ISO-8859-6 | *ISO_8859-6:1987 iso-ir-127 ISO_8859-6 ECMA-114 ASMO-708 arabic csISOLatinArabic 8859_6 | *
ISO8859_7 | *ISO-8859-7 | *ISO_8859-7:1987 iso-ir-126 ISO_8859-7 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 sun_eu_greek | *
ISO8859_8 | *ISO-8859-8 | *ISO_8859-8:1988 iso-ir-138 ISO_8859-8 hebrew csISOLatinHebrew 8859_8 | *
ISO8859_9 | *ISO-8859-9 | *ISO_8859-9:1989 iso-ir-148 ISO_8859-9 latin5 l5 csISOLatin5 8859_9 | *
JISAutoDetect | *? | ** |
JIS_C6626-1983 | *JIS_C6626-1983 | *x-JIS0208 JIS0208 csISO87JISX0208 x0208 JIS_X0208-1983 iso-ir-87 | *
JIS_X0201 | *JIS_X0201 | *X0201 JIS0201 csHalfWidthKatakana | *
JIS_X0212-1990 | *JIS_X0212-1990 | *iso-ir-159 x0212 JIS0212 csISO159JISX02121990 | *
KOI8_R | *KOI8-R | *csKOI8R koi8 | *
MS874 | *windows-874 | *cp874 | *
MS932 | *Windows-31J | *windows-932 csWindows31J x-ms-cp932 | *
MS949 | *windows-949 | *windows949 ms_949 x-windows-949 | *
MS950 | *windows-950 | *x-windows-950 | *
MS950_HKSCS | ** | * |
MacArabic | *? | ** |
MacCentralEurope | *? | ** |
MacCroatian | *? | ** |
MacCyrillic | *? | ** |
MacDingbat | *? | ** |
MacGreek | *MacGreek | ** |
MacHebrew | *? | ** |
MacIceland | *? | ** |
MacRoman | *MacRoman | *Macintosh MAC csMacintosh | *
MacRomania | *? | ** |
MacSymbol | *? | ** |
MacThai | *? | ** |
MacTurkish | *? | ** |
MacUkraine | *? | ** |
SJIS | *Shift_JIS | *MS_Kanji csShiftJIS shift-jis x-sjis pck | *
TIS620 | *TIS-620 | ** |
UTF-16 | *UTF-16 | *UTF_16 | *
UTF8 | *UTF-8 | ** |
UnicodeBig | *? | ** |
UnicodeBigUnmarked | *UTF-16BE | *X-UTF-16BE UTF_16BE ISO-10646-UCS-2 | *
UnicodeLittle | *? | ** |
UnicodeLittleUnmarked | *UTF-16LE | *UTF_16LE X-UTF-16LE | *
x-Johab | *johab | *johab cp1361 ms1361 ksc5601-1992 ksc5601_1992 | *
x-iso-8859-11 | *? | ** |
 */
/**
 * Returns {@code true} if the specified character is a whitespace
 * character (CR, LF, SP or HT).
 *
 * ANDROID: COPIED FROM A NEWER VERSION OF MIME4J
 *
 * @param ch
 *            character to test.
 * @return {@code true} if the specified character is a whitespace
 *         character, {@code false} otherwise.
 */
public static boolean isWhitespace(char ch) {
    // The character counts as whitespace when it equals any one of the
    // four constants SP, HT, CR or LF.
    if (ch == SP || ch == HT) {
        return true;
    }
    return ch == CR || ch == LF;
}
/**
 * Returns {@code true} if the specified string consists entirely of
 * whitespace characters.
 *
 * ANDROID: COPIED FROM A NEWER VERSION OF MIME4J
 *
 * @param s
 *            string to test.
 * @return {@code true} if the specified string consists entirely of
 *         whitespace characters, {@code false} otherwise.
 * @throws IllegalArgumentException if the specified string is {@code null}.
 */
public static boolean isWhitespace(final String s) {
    if (s == null) {
        throw new IllegalArgumentException("String may not be null");
    }
    // Advance past leading whitespace; the string is whitespace-only exactly
    // when the scan reaches the end. An empty string has nothing to scan and
    // is therefore reported as whitespace-only.
    final int end = s.length();
    int pos = 0;
    while (pos < end && isWhitespace(s.charAt(pos))) {
        pos++;
    }
    return pos == end;
}
/**
 * Determines if the VM supports encoding (chars to bytes) the
 * specified character set. NOTE: the given character set name may
 * not be known to the VM even if this method returns {@code true}.
 * Use {@link #toJavaCharset(String)} to get the canonical Java character
 * set name.
 *
 * @param charsetName the character set name.
 * @return {@code true} if encoding is supported, {@code false}
 *         otherwise.
 */
public static boolean isEncodingSupported(String charsetName) {
    // A null name cannot denote a supported charset: answer false rather
    // than failing with a NullPointerException on toLowerCase().
    if (charsetName == null) {
        return false;
    }
    // The lookup key is the lower-cased name; Locale.US keeps the case
    // folding independent of the VM's default locale.
    return encodingSupported.contains(charsetName.toLowerCase(Locale.US));
}
/**
 * Determines if the VM supports decoding (bytes to chars) the
 * specified character set. NOTE: the given character set name may
 * not be known to the VM even if this method returns {@code true}.
 * Use {@link #toJavaCharset(String)} to get the canonical Java character
 * set name.
 *
 * @param charsetName the character set name.
 * @return {@code true} if decoding is supported, {@code false}
 *         otherwise.
 */
public static boolean isDecodingSupported(String charsetName) {
    // A null name cannot denote a supported charset: answer false rather
    // than failing with a NullPointerException on toLowerCase().
    if (charsetName == null) {
        return false;
    }
    // The lookup key is the lower-cased name; Locale.US keeps the case
    // folding independent of the VM's default locale.
    return decodingSupported.contains(charsetName.toLowerCase(Locale.US));
}
/**
 * Gets the preferred MIME character set name for the specified
 * character set or {@code null} if not known.
 *
 * @param charsetName the character set name to look for.
 * @return the MIME preferred name or {@code null} if not known.
 */
public static String toMimeCharset(String charsetName) {
    // A null name is treated as "not known" instead of triggering a
    // NullPointerException on toLowerCase().
    if (charsetName == null) {
        return null;
    }
    // The map is queried with the lower-cased name; Locale.US keeps the
    // case folding independent of the VM's default locale.
    Charset c = charsetMap.get(charsetName.toLowerCase(Locale.US));
    return (c != null) ? c.mime : null;
}
/**
 * Gets the canonical Java character set name for the specified
 * character set or {@code null} if not known. This should be
 * called before doing any conversions using the Java API. NOTE:
 * you must use {@link #isEncodingSupported(String)} or
 * {@link #isDecodingSupported(String)} to make sure the returned
 * Java character set is supported by the current VM.
 *
 * @param charsetName the character set name to look for.
 * @return the canonical Java name or {@code null} if not known.
 */
public static String toJavaCharset(String charsetName) {
    // A null name is treated as "not known" instead of triggering a
    // NullPointerException on toLowerCase().
    if (charsetName == null) {
        return null;
    }
    // The map is queried with the lower-cased name; Locale.US keeps the
    // case folding independent of the VM's default locale.
    Charset c = charsetMap.get(charsetName.toLowerCase(Locale.US));
    return (c != null) ? c.canonical : null;
}
/**
 * Resolves a character set name to a {@code java.nio.charset.Charset},
 * falling back to ISO-8859-1 whenever the name cannot be used.
 * The fallback is taken when the name is {@code null}, when it is
 * syntactically illegal, or when the VM does not support it; the latter
 * two cases are reported through the logger at info level.
 *
 * @param charsetName the character set name to resolve, may be
 *        {@code null}.
 * @return the charset for the given name, or the ISO-8859-1 charset as
 *         fallback.
 */
public static java.nio.charset.Charset getCharset(String charsetName) {
    final String fallbackName = "ISO-8859-1";
    // A missing name silently resolves to the fallback charset.
    final String requested = (charsetName != null) ? charsetName : fallbackName;
    String problem;
    try {
        return java.nio.charset.Charset.forName(requested);
    } catch (IllegalCharsetNameException e) {
        problem = "Illegal charset " + requested + ", fallback to " +
                fallbackName + ": " + e;
    } catch (UnsupportedCharsetException e) {
        problem = "Unsupported charset " + requested + ", fallback to " +
                fallbackName + ": " + e;
    }
    // Only reached when the lookup failed: report it, then use the fallback.
    log.info(problem);
    return java.nio.charset.Charset.forName(fallbackName);
}
/*
* Uncomment the code below and run the main method to regenerate the
* Javadoc table above when the known charsets change.
*/
/*
private static String dumpHtmlTable() {
LinkedList l = new LinkedList(Arrays.asList(JAVA_CHARSETS));
Collections.sort(l);
StringBuffer sb = new StringBuffer();
sb.append(" * Canonical (Java) name | \n"); sb.append(" *MIME preferred | \n"); sb.append(" *Aliases | \n"); sb.append(" *
" + c.canonical + " | \n"); sb.append(" *" + (c.mime == null ? "?" : c.mime)+ " | \n"); sb.append(" *"); for (int i = 0; c.aliases != null && i < c.aliases.length; i++) { sb.append(c.aliases[i] + " "); } sb.append(" | \n"); sb.append(" *