1 /****************************************************************
2  * Licensed to the Apache Software Foundation (ASF) under one   *
3  * or more contributor license agreements.  See the NOTICE file *
4  * distributed with this work for additional information        *
5  * regarding copyright ownership.  The ASF licenses this file   *
6  * to you under the Apache License, Version 2.0 (the            *
7  * "License"); you may not use this file except in compliance   *
8  * with the License.  You may obtain a copy of the License at   *
9  *                                                              *
10  *   http://www.apache.org/licenses/LICENSE-2.0                 *
11  *                                                              *
12  * Unless required by applicable law or agreed to in writing,   *
13  * software distributed under the License is distributed on an  *
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15  * KIND, either express or implied.  See the License for the    *
16  * specific language governing permissions and limitations      *
17  * under the License.                                           *
18  ****************************************************************/
19 
20 package org.apache.james.mime4j.codec;
21 
22 import java.nio.ByteBuffer;
23 import java.nio.charset.Charset;
24 import java.util.BitSet;
25 import java.util.Locale;
26 
27 import org.apache.james.mime4j.util.CharsetUtil;
28 
29 /**
30  * ANDROID:  THIS CLASS IS COPIED FROM A NEWER VERSION OF MIME4J
31  */
32 
33 /**
34  * Static methods for encoding header field values. This includes encoded-words
35  * as defined in <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a>
36  * or display-names of an e-mail address, for example.
37  *
38  */
39 public class EncoderUtil {
40 
41     // This array is a lookup table that translates 6-bit positive integer index
42     // values into their "Base64 Alphabet" equivalents as specified in Table 1
43     // of RFC 2045.
44     // ANDROID:  THIS TABLE IS COPIED FROM BASE64OUTPUTSTREAM
45     static final byte[] BASE64_TABLE = { 'A', 'B', 'C', 'D', 'E', 'F',
46             'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
47             'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
48             'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
49             't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5',
50             '6', '7', '8', '9', '+', '/' };
51 
52     // Byte used to pad output.
53     private static final byte BASE64_PAD = '=';
54 
55     private static final BitSet Q_REGULAR_CHARS = initChars("=_?");
56 
57     private static final BitSet Q_RESTRICTED_CHARS = initChars("=_?\"#$%&'(),.:;<>@[\\]^`{|}~");
58 
59     private static final int MAX_USED_CHARACTERS = 50;
60 
61     private static final String ENC_WORD_PREFIX = "=?";
62     private static final String ENC_WORD_SUFFIX = "?=";
63 
64     private static final int ENCODED_WORD_MAX_LENGTH = 75; // RFC 2047
65 
66     private static final BitSet TOKEN_CHARS = initChars("()<>@,;:\\\"/[]?=");
67 
68     private static final BitSet ATEXT_CHARS = initChars("()<>@.,;:\\\"[]");
69 
initChars(String specials)70     private static BitSet initChars(String specials) {
71         BitSet bs = new BitSet(128);
72         for (char ch = 33; ch < 127; ch++) {
73             if (specials.indexOf(ch) == -1) {
74                 bs.set(ch);
75             }
76         }
77         return bs;
78     }
79 
80     /**
81      * Selects one of the two encodings specified in RFC 2047.
82      */
83     public enum Encoding {
84         /** The B encoding (identical to base64 defined in RFC 2045). */
85         B,
86         /** The Q encoding (similar to quoted-printable defined in RFC 2045). */
87         Q
88     }
89 
90     /**
91      * Indicates the intended usage of an encoded word.
92      */
93     public enum Usage {
94         /**
95          * Encoded word is used to replace a 'text' token in any Subject or
96          * Comments header field.
97          */
98         TEXT_TOKEN,
99         /**
100          * Encoded word is used to replace a 'word' entity within a 'phrase',
101          * for example, one that precedes an address in a From, To, or Cc
102          * header.
103          */
104         WORD_ENTITY
105     }
106 
EncoderUtil()107     private EncoderUtil() {
108     }
109 
110     /**
111      * Encodes the display-name portion of an address. See <a
112      * href='http://www.faqs.org/rfcs/rfc5322.html'>RFC 5322</a> section 3.4
113      * and <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a> section
114      * 5.3. The specified string should not be folded.
115      *
116      * @param displayName
117      *            display-name to encode.
118      * @return encoded display-name.
119      */
encodeAddressDisplayName(String displayName)120     public static String encodeAddressDisplayName(String displayName) {
121         // display-name = phrase
122         // phrase = 1*( encoded-word / word )
123         // word = atom / quoted-string
124         // atom = [CFWS] 1*atext [CFWS]
125         // CFWS = comment or folding white space
126 
127         if (isAtomPhrase(displayName)) {
128             return displayName;
129         } else if (hasToBeEncoded(displayName, 0)) {
130             return encodeEncodedWord(displayName, Usage.WORD_ENTITY);
131         } else {
132             return quote(displayName);
133         }
134     }
135 
136     /**
137      * Encodes the local part of an address specification as described in RFC
138      * 5322 section 3.4.1. Leading and trailing CFWS should have been removed
139      * before calling this method. The specified string should not contain any
140      * illegal (control or non-ASCII) characters.
141      *
142      * @param localPart
143      *            the local part to encode
144      * @return the encoded local part.
145      */
encodeAddressLocalPart(String localPart)146     public static String encodeAddressLocalPart(String localPart) {
147         // local-part = dot-atom / quoted-string
148         // dot-atom = [CFWS] dot-atom-text [CFWS]
149         // CFWS = comment or folding white space
150 
151         if (isDotAtomText(localPart)) {
152             return localPart;
153         } else {
154             return quote(localPart);
155         }
156     }
157 
158     /**
159      * Encodes the specified strings into a header parameter as described in RFC
160      * 2045 section 5.1 and RFC 2183 section 2. The specified strings should not
161      * contain any illegal (control or non-ASCII) characters.
162      *
163      * @param name
164      *            parameter name.
165      * @param value
166      *            parameter value.
167      * @return encoded result.
168      */
encodeHeaderParameter(String name, String value)169     public static String encodeHeaderParameter(String name, String value) {
170         name = name.toLowerCase(Locale.US);
171 
172         // value := token / quoted-string
173         if (isToken(value)) {
174             return name + "=" + value;
175         } else {
176             return name + "=" + quote(value);
177         }
178     }
179 
180     /**
181      * Shortcut method that encodes the specified text into an encoded-word if
182      * the text has to be encoded.
183      *
184      * @param text
185      *            text to encode.
186      * @param usage
187      *            whether the encoded-word is to be used to replace a text token
188      *            or a word entity (see RFC 822).
189      * @param usedCharacters
190      *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
191      * @return the specified text if encoding is not necessary or an encoded
192      *         word or a sequence of encoded words otherwise.
193      */
encodeIfNecessary(String text, Usage usage, int usedCharacters)194     public static String encodeIfNecessary(String text, Usage usage,
195             int usedCharacters) {
196         if (hasToBeEncoded(text, usedCharacters))
197             return encodeEncodedWord(text, usage, usedCharacters);
198         else
199             return text;
200     }
201 
202     /**
203      * Determines if the specified string has to encoded into an encoded-word.
204      * Returns <code>true</code> if the text contains characters that don't
205      * fall into the printable ASCII character set or if the text contains a
206      * 'word' (sequence of non-whitespace characters) longer than 77 characters
207      * (including characters already used up in the line).
208      *
209      * @param text
210      *            text to analyze.
211      * @param usedCharacters
212      *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
213      * @return <code>true</code> if the specified text has to be encoded into
214      *         an encoded-word, <code>false</code> otherwise.
215      */
hasToBeEncoded(String text, int usedCharacters)216     public static boolean hasToBeEncoded(String text, int usedCharacters) {
217         if (text == null)
218             throw new IllegalArgumentException();
219         if (usedCharacters < 0 || usedCharacters > MAX_USED_CHARACTERS)
220             throw new IllegalArgumentException();
221 
222         int nonWhiteSpaceCount = usedCharacters;
223 
224         for (int idx = 0; idx < text.length(); idx++) {
225             char ch = text.charAt(idx);
226             if (ch == '\t' || ch == ' ') {
227                 nonWhiteSpaceCount = 0;
228             } else {
229                 nonWhiteSpaceCount++;
230                 if (nonWhiteSpaceCount > 77) {
231                     // Line cannot be folded into multiple lines with no more
232                     // than 78 characters each. Encoding as encoded-words makes
233                     // that possible. One character has to be reserved for
234                     // folding white space; that leaves 77 characters.
235                     return true;
236                 }
237 
238                 if (ch < 32 || ch >= 127) {
239                     // non-printable ascii character has to be encoded
240                     return true;
241                 }
242             }
243         }
244 
245         return false;
246     }
247 
248     /**
249      * Encodes the specified text into an encoded word or a sequence of encoded
250      * words separated by space. The text is separated into a sequence of
251      * encoded words if it does not fit in a single one.
252      * <p>
253      * The charset to encode the specified text into a byte array and the
254      * encoding to use for the encoded-word are detected automatically.
255      * <p>
256      * This method assumes that zero characters have already been used up in the
257      * current line.
258      *
259      * @param text
260      *            text to encode.
261      * @param usage
262      *            whether the encoded-word is to be used to replace a text token
263      *            or a word entity (see RFC 822).
264      * @return the encoded word (or sequence of encoded words if the given text
265      *         does not fit in a single encoded word).
266      * @see #hasToBeEncoded(String, int)
267      */
encodeEncodedWord(String text, Usage usage)268     public static String encodeEncodedWord(String text, Usage usage) {
269         return encodeEncodedWord(text, usage, 0, null, null);
270     }
271 
272     /**
273      * Encodes the specified text into an encoded word or a sequence of encoded
274      * words separated by space. The text is separated into a sequence of
275      * encoded words if it does not fit in a single one.
276      * <p>
277      * The charset to encode the specified text into a byte array and the
278      * encoding to use for the encoded-word are detected automatically.
279      *
280      * @param text
281      *            text to encode.
282      * @param usage
283      *            whether the encoded-word is to be used to replace a text token
284      *            or a word entity (see RFC 822).
285      * @param usedCharacters
286      *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
287      * @return the encoded word (or sequence of encoded words if the given text
288      *         does not fit in a single encoded word).
289      * @see #hasToBeEncoded(String, int)
290      */
encodeEncodedWord(String text, Usage usage, int usedCharacters)291     public static String encodeEncodedWord(String text, Usage usage,
292             int usedCharacters) {
293         return encodeEncodedWord(text, usage, usedCharacters, null, null);
294     }
295 
296     /**
297      * Encodes the specified text into an encoded word or a sequence of encoded
298      * words separated by space. The text is separated into a sequence of
299      * encoded words if it does not fit in a single one.
300      *
301      * @param text
302      *            text to encode.
303      * @param usage
304      *            whether the encoded-word is to be used to replace a text token
305      *            or a word entity (see RFC 822).
306      * @param usedCharacters
307      *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
308      * @param charset
309      *            the Java charset that should be used to encode the specified
310      *            string into a byte array. A suitable charset is detected
311      *            automatically if this parameter is <code>null</code>.
312      * @param encoding
313      *            the encoding to use for the encoded-word (either B or Q). A
314      *            suitable encoding is automatically chosen if this parameter is
315      *            <code>null</code>.
316      * @return the encoded word (or sequence of encoded words if the given text
317      *         does not fit in a single encoded word).
318      * @see #hasToBeEncoded(String, int)
319      */
encodeEncodedWord(String text, Usage usage, int usedCharacters, Charset charset, Encoding encoding)320     public static String encodeEncodedWord(String text, Usage usage,
321             int usedCharacters, Charset charset, Encoding encoding) {
322         if (text == null)
323             throw new IllegalArgumentException();
324         if (usedCharacters < 0 || usedCharacters > MAX_USED_CHARACTERS)
325             throw new IllegalArgumentException();
326 
327         if (charset == null)
328             charset = determineCharset(text);
329 
330         String mimeCharset = CharsetUtil.toMimeCharset(charset.name());
331         if (mimeCharset == null) {
332             // cannot happen if charset was originally null
333             throw new IllegalArgumentException("Unsupported charset");
334         }
335 
336         byte[] bytes = encode(text, charset);
337 
338         if (encoding == null)
339             encoding = determineEncoding(bytes, usage);
340 
341         if (encoding == Encoding.B) {
342             String prefix = ENC_WORD_PREFIX + mimeCharset + "?B?";
343             return encodeB(prefix, text, usedCharacters, charset, bytes);
344         } else {
345             String prefix = ENC_WORD_PREFIX + mimeCharset + "?Q?";
346             return encodeQ(prefix, text, usage, usedCharacters, charset, bytes);
347         }
348     }
349 
350     /**
351      * Encodes the specified byte array using the B encoding defined in RFC
352      * 2047.
353      *
354      * @param bytes
355      *            byte array to encode.
356      * @return encoded string.
357      */
encodeB(byte[] bytes)358     public static String encodeB(byte[] bytes) {
359         StringBuilder sb = new StringBuilder();
360 
361         int idx = 0;
362         final int end = bytes.length;
363         for (; idx < end - 2; idx += 3) {
364             int data = (bytes[idx] & 0xff) << 16 | (bytes[idx + 1] & 0xff) << 8
365                     | bytes[idx + 2] & 0xff;
366             sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
367             sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
368             sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
369             sb.append((char) BASE64_TABLE[data & 0x3f]);
370         }
371 
372         if (idx == end - 2) {
373             int data = (bytes[idx] & 0xff) << 16 | (bytes[idx + 1] & 0xff) << 8;
374             sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
375             sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
376             sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
377             sb.append((char) BASE64_PAD);
378 
379         } else if (idx == end - 1) {
380             int data = (bytes[idx] & 0xff) << 16;
381             sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
382             sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
383             sb.append((char) BASE64_PAD);
384             sb.append((char) BASE64_PAD);
385         }
386 
387         return sb.toString();
388     }
389 
390     /**
391      * Encodes the specified byte array using the Q encoding defined in RFC
392      * 2047.
393      *
394      * @param bytes
395      *            byte array to encode.
396      * @param usage
397      *            whether the encoded-word is to be used to replace a text token
398      *            or a word entity (see RFC 822).
399      * @return encoded string.
400      */
encodeQ(byte[] bytes, Usage usage)401     public static String encodeQ(byte[] bytes, Usage usage) {
402         BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
403                 : Q_RESTRICTED_CHARS;
404 
405         StringBuilder sb = new StringBuilder();
406 
407         final int end = bytes.length;
408         for (int idx = 0; idx < end; idx++) {
409             int v = bytes[idx] & 0xff;
410             if (v == 32) {
411                 sb.append('_');
412             } else if (!qChars.get(v)) {
413                 sb.append('=');
414                 sb.append(hexDigit(v >>> 4));
415                 sb.append(hexDigit(v & 0xf));
416             } else {
417                 sb.append((char) v);
418             }
419         }
420 
421         return sb.toString();
422     }
423 
424     /**
425      * Tests whether the specified string is a token as defined in RFC 2045
426      * section 5.1.
427      *
428      * @param str
429      *            string to test.
430      * @return <code>true</code> if the specified string is a RFC 2045 token,
431      *         <code>false</code> otherwise.
432      */
isToken(String str)433     public static boolean isToken(String str) {
434         // token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
435         // tspecials := "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "\" /
436         // <"> / "/" / "[" / "]" / "?" / "="
437         // CTL := 0.- 31., 127.
438 
439         final int length = str.length();
440         if (length == 0)
441             return false;
442 
443         for (int idx = 0; idx < length; idx++) {
444             char ch = str.charAt(idx);
445             if (!TOKEN_CHARS.get(ch))
446                 return false;
447         }
448 
449         return true;
450     }
451 
isAtomPhrase(String str)452     private static boolean isAtomPhrase(String str) {
453         // atom = [CFWS] 1*atext [CFWS]
454 
455         boolean containsAText = false;
456 
457         final int length = str.length();
458         for (int idx = 0; idx < length; idx++) {
459             char ch = str.charAt(idx);
460             if (ATEXT_CHARS.get(ch)) {
461                 containsAText = true;
462             } else if (!CharsetUtil.isWhitespace(ch)) {
463                 return false;
464             }
465         }
466 
467         return containsAText;
468     }
469 
470     // RFC 5322 section 3.2.3
isDotAtomText(String str)471     private static boolean isDotAtomText(String str) {
472         // dot-atom-text = 1*atext *("." 1*atext)
473         // atext = ALPHA / DIGIT / "!" / "#" / "$" / "%" / "&" / "'" / "*" /
474         // "+" / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "|" / "}" / "~"
475 
476         char prev = '.';
477 
478         final int length = str.length();
479         if (length == 0)
480             return false;
481 
482         for (int idx = 0; idx < length; idx++) {
483             char ch = str.charAt(idx);
484 
485             if (ch == '.') {
486                 if (prev == '.' || idx == length - 1)
487                     return false;
488             } else {
489                 if (!ATEXT_CHARS.get(ch))
490                     return false;
491             }
492 
493             prev = ch;
494         }
495 
496         return true;
497     }
498 
499     // RFC 5322 section 3.2.4
quote(String str)500     private static String quote(String str) {
501         // quoted-string = [CFWS] DQUOTE *([FWS] qcontent) [FWS] DQUOTE [CFWS]
502         // qcontent = qtext / quoted-pair
503         // qtext = %d33 / %d35-91 / %d93-126
504         // quoted-pair = ("\" (VCHAR / WSP))
505         // VCHAR = %x21-7E
506         // DQUOTE = %x22
507 
508         String escaped = str.replaceAll("[\\\\\"]", "\\\\$0");
509         return "\"" + escaped + "\"";
510     }
511 
encodeB(String prefix, String text, int usedCharacters, Charset charset, byte[] bytes)512     private static String encodeB(String prefix, String text,
513             int usedCharacters, Charset charset, byte[] bytes) {
514         int encodedLength = bEncodedLength(bytes);
515 
516         int totalLength = prefix.length() + encodedLength
517                 + ENC_WORD_SUFFIX.length();
518         if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
519             return prefix + encodeB(bytes) + ENC_WORD_SUFFIX;
520         } else {
521             int splitOffset = text.offsetByCodePoints(text.length() / 2, -1);
522 
523             String part1 = text.substring(0, splitOffset);
524             byte[] bytes1 = encode(part1, charset);
525             String word1 = encodeB(prefix, part1, usedCharacters, charset,
526                     bytes1);
527 
528             String part2 = text.substring(splitOffset);
529             byte[] bytes2 = encode(part2, charset);
530             String word2 = encodeB(prefix, part2, 0, charset, bytes2);
531 
532             return word1 + " " + word2;
533         }
534     }
535 
bEncodedLength(byte[] bytes)536     private static int bEncodedLength(byte[] bytes) {
537         return (bytes.length + 2) / 3 * 4;
538     }
539 
encodeQ(String prefix, String text, Usage usage, int usedCharacters, Charset charset, byte[] bytes)540     private static String encodeQ(String prefix, String text, Usage usage,
541             int usedCharacters, Charset charset, byte[] bytes) {
542         int encodedLength = qEncodedLength(bytes, usage);
543 
544         int totalLength = prefix.length() + encodedLength
545                 + ENC_WORD_SUFFIX.length();
546         if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
547             return prefix + encodeQ(bytes, usage) + ENC_WORD_SUFFIX;
548         } else {
549             int splitOffset = text.offsetByCodePoints(text.length() / 2, -1);
550 
551             String part1 = text.substring(0, splitOffset);
552             byte[] bytes1 = encode(part1, charset);
553             String word1 = encodeQ(prefix, part1, usage, usedCharacters,
554                     charset, bytes1);
555 
556             String part2 = text.substring(splitOffset);
557             byte[] bytes2 = encode(part2, charset);
558             String word2 = encodeQ(prefix, part2, usage, 0, charset, bytes2);
559 
560             return word1 + " " + word2;
561         }
562     }
563 
qEncodedLength(byte[] bytes, Usage usage)564     private static int qEncodedLength(byte[] bytes, Usage usage) {
565         BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
566                 : Q_RESTRICTED_CHARS;
567 
568         int count = 0;
569 
570         for (int idx = 0; idx < bytes.length; idx++) {
571             int v = bytes[idx] & 0xff;
572             if (v == 32) {
573                 count++;
574             } else if (!qChars.get(v)) {
575                 count += 3;
576             } else {
577                 count++;
578             }
579         }
580 
581         return count;
582     }
583 
encode(String text, Charset charset)584     private static byte[] encode(String text, Charset charset) {
585         ByteBuffer buffer = charset.encode(text);
586         byte[] bytes = new byte[buffer.limit()];
587         buffer.get(bytes);
588         return bytes;
589     }
590 
determineCharset(String text)591     private static Charset determineCharset(String text) {
592         // it is an important property of iso-8859-1 that it directly maps
593         // unicode code points 0000 to 00ff to byte values 00 to ff.
594         boolean ascii = true;
595         final int len = text.length();
596         for (int index = 0; index < len; index++) {
597             char ch = text.charAt(index);
598             if (ch > 0xff) {
599                 return CharsetUtil.UTF_8;
600             }
601             if (ch > 0x7f) {
602                 ascii = false;
603             }
604         }
605         return ascii ? CharsetUtil.US_ASCII : CharsetUtil.ISO_8859_1;
606     }
607 
determineEncoding(byte[] bytes, Usage usage)608     private static Encoding determineEncoding(byte[] bytes, Usage usage) {
609         if (bytes.length == 0)
610             return Encoding.Q;
611 
612         BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
613                 : Q_RESTRICTED_CHARS;
614 
615         int qEncoded = 0;
616         for (int i = 0; i < bytes.length; i++) {
617             int v = bytes[i] & 0xff;
618             if (v != 32 && !qChars.get(v)) {
619                 qEncoded++;
620             }
621         }
622 
623         int percentage = qEncoded * 100 / bytes.length;
624         return percentage > 30 ? Encoding.B : Encoding.Q;
625     }
626 
hexDigit(int i)627     private static char hexDigit(int i) {
628         return i < 10 ? (char) (i + '0') : (char) (i - 10 + 'A');
629     }
630 }
631